Index: head/sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- head/sys/dev/cxgbe/tom/t4_ddp.c	(revision 367333)
+++ head/sys/dev/cxgbe/tom/t4_ddp.c	(revision 367334)
@@ -1,1943 +1,1943 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * received by the AIO job so far.
  */
 #define	aio_received	backend3
 
 static void aio_ddp_requeue_task(void *context, int pending);
 static void ddp_complete_all(struct toepcb *toep, int error);
 static void t4_aio_cancel_active(struct kaiocb *job);
 static void t4_aio_cancel_queued(struct kaiocb *job);
 
 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
 static struct mtx ddp_orphan_pagesets_lock;
 static struct task ddp_orphan_task;
 
 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
 
 /*
  * A page set holds information about a buffer used for DDP.  The page
  * set holds resources such as the VM pages backing the buffer (either
  * held or wired) and the page pods associated with the buffer.
  * Recently used page sets are cached to allow for efficient reuse of
  * buffers (avoiding the need to re-fault in pages, hold them, etc.).
  * Note that cached page sets keep the backing pages wired.  The
  * number of wired pages is capped by only allowing for two wired
  * pagesets per connection.  This is not a perfect cap, but is a
  * trade-off for performance.
  *
  * If an application ping-pongs two buffers for a connection via
  * aio_read(2) then those buffers should remain wired and expensive VM
  * fault lookups should be avoided after each buffer has been used
  * once.  If an application uses more than two buffers then this will
  * fall back to doing expensive VM fault lookups for each operation.
  */
 static void
 free_pageset(struct tom_data *td, struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	if (ps->prsv.prsv_nppods > 0)
 		t4_free_page_pods(&ps->prsv);
 
 	for (i = 0; i < ps->npages; i++) {
 		p = ps->pages[i];
 		vm_page_unwire(p, PQ_INACTIVE);
 	}
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 ddp_free_orphan_pagesets(void *context, int pending)
 {
 	struct pageset *ps;
 
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
 		mtx_unlock(&ddp_orphan_pagesets_lock);
 		if (ps->vm)
 			vmspace_free(ps->vm);
 		free(ps, M_CXGBE);
 		mtx_lock(&ddp_orphan_pagesets_lock);
 	}
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 static void
 recycle_pageset(struct toepcb *toep, struct pageset *ps)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp.flags & DDP_DEAD)) {
 		KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
 		    nitems(toep->ddp.db), ("too many wired pagesets"));
 		TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count++;
 	} else
 		free_pageset(toep->td, ps);
 }
 
 static void
 ddp_complete_one(struct kaiocb *job, int error)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0 || error == 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 static void
 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
 {
 
 	if (db->job) {
 		/*
 		 * XXX: If we are un-offloading the socket then we
 		 * should requeue these on the socket somehow.  If we
 		 * got a FIN from the remote end, then this completes
 		 * any remaining requests with an EOF read.
 		 */
 		if (!aio_clear_cancel_function(db->job))
 			ddp_complete_one(db->job, 0);
 	}
 
 	if (db->ps)
 		free_pageset(td, db->ps);
 }
 
 void
 ddp_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->ddp.aiojobq);
 	TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep);
 	toep->ddp.flags = DDP_OK;
 	toep->ddp.active_id = -1;
 	mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
 }
 
 void
 ddp_uninit_toep(struct toepcb *toep)
 {
 
 	mtx_destroy(&toep->ddp.lock);
 }
 
 void
 release_ddp_resources(struct toepcb *toep)
 {
 	struct pageset *ps;
 	int i;
 
 	DDP_LOCK(toep);
 	toep->ddp.flags |= DDP_DEAD;
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		free_ddp_buffer(toep->td, &toep->ddp.db[i]);
 	}
 	while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		free_pageset(toep->td, ps);
 	}
 	ddp_complete_all(toep, 0);
 	DDP_UNLOCK(toep);
 }
 
 #ifdef INVARIANTS
 void
 ddp_assert_empty(struct toepcb *toep)
 {
 	int i;
 
 	MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE));
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		MPASS(toep->ddp.db[i].job == NULL);
 		MPASS(toep->ddp.db[i].ps == NULL);
 	}
 	MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
 	MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
 }
 #endif
 
 static void
 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
     unsigned int db_idx)
 {
 	unsigned int db_flag;
 
 	toep->ddp.active_count--;
 	if (toep->ddp.active_id == db_idx) {
 		if (toep->ddp.active_count == 0) {
 			KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
 			    ("%s: active_count mismatch", __func__));
 			toep->ddp.active_id = -1;
 		} else
 			toep->ddp.active_id ^= 1;
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %u, ddp_active_id = %d", __func__,
 		    toep->tid, toep->ddp.active_id);
 #endif
 	} else {
 		KASSERT(toep->ddp.active_count != 0 &&
 		    toep->ddp.active_id != -1,
 		    ("%s: active count mismatch", __func__));
 	}
 
 	db->cancel_pending = 0;
 	db->job = NULL;
 	recycle_pageset(toep, db->ps);
 	db->ps = NULL;
 
 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 	KASSERT(toep->ddp.flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 	toep->ddp.flags &= ~db_flag;
 }
 
 /* XXX: handle_ddp_data code duplication */
 void
 insert_ddp_data(struct toepcb *toep, uint32_t n)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	size_t placed;
 	long copied;
 	unsigned int db_flag, db_idx;
 
 	INP_WLOCK_ASSERT(inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	tp->rcv_nxt += n;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
 	    __func__, n);
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		placed = n;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else if (copied + placed != 0) {
 			CTR4(KTR_CXGBE,
 			    "%s: completing %p (copied %ld, placed %lu)",
 			    __func__, job, copied, placed);
 			/* XXX: This always completes if there is some data. */
 			aio_complete(job, copied + placed, 0);
 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 			TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 			toep->ddp.waiting_count++;
 		} else
 			aio_cancel(job);
 		n -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(n == 0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* RX_DATA_ACK sent as a ULP command looks like this */
 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
 
 static inline void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
     uint64_t word, uint64_t mask, uint64_t val)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_set_tcb_field_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
         req->mask = htobe64(mask);
         req->val = htobe64(val);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__SET_TCB_FIELD_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 static inline void *
 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_rx_data_ack_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__RX_DATA_ACK_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 static struct wrqe *
 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
     struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
 {
 	struct wrqe *wr;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int len;
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	/*
 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
 	 *
 	 * The work request header is 16B and always ends at a 16B boundary.
 	 * The ULPTX master commands that follow must all end at 16B boundaries
 	 * too so we round up the size to 16.
 	 */
 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
 
 	wr = alloc_wrqe(len, toep->ctrlq);
 	if (wr == NULL)
 		return (NULL);
 	wrh = wrtod(wr);
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
 	/* Write the buffer's tag */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 	    V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));
 
 	/* Update the current offset in the DDP buffer and its total length */
 	if (db_idx == 0)
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF0_OFFSET,
 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF0_LEN(ps->len));
 	else
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF1_OFFSET,
 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));
 
 	/* Update DDP flags */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
 	    ddp_flags_mask, ddp_flags);
 
 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
 
 	return (wr);
 }
 
 static int
 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
 {
 	uint32_t report = be32toh(ddp_report);
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct kaiocb *job;
 	long copied;
 
 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
 
 	if (__predict_false(!(report & F_DDP_INV)))
 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
 
 	INP_WLOCK(inp);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	DDP_LOCK(toep);
 
 	KASSERT(toep->ddp.active_id == db_idx,
 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
 	    toep->ddp.active_id, toep->tid));
 	db = &toep->ddp.db[db_idx];
 	job = db->job;
 
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 		/*
 		 * This can happen due to an administrative tcpdrop(8).
 		 * Just fail the request with ECONNRESET.
 		 */
 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, ECONNRESET);
 		goto completed;
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
 	 * sequence number of the next byte to receive.  The length of
 	 * the data received for this message must be computed by
 	 * comparing the new and old values of rcv_nxt.
 	 *
 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
 	 * length of the most recent DMA.  It does not include the
 	 * total length of the data received since the previous update
 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
 	 * first received byte from the most recent DMA.
 	 */
 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 	tp->t_rcvtime = ticks;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 #endif
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
 	    toep->tid, db_idx, len, report);
 #endif
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		struct adapter *sc = td_adapter(toep->td);
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 	SOCKBUF_UNLOCK(sb);
 	CURVNET_RESTORE();
 
 	job->msgrcv = 1;
 	if (db->cancel_pending) {
 		/*
 		 * Update the job's length but defer completion to the
 		 * TCB_RPL callback.
 		 */
 		job->aio_received += len;
 		goto out;
 	} else if (!aio_clear_cancel_function(job)) {
 		/*
 		 * Update the copied length for when
 		 * t4_aio_cancel_active() completes this request.
 		 */
 		job->aio_received += len;
 	} else {
 		copied = job->aio_received;
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE,
 		    "%s: tid %u, completing %p (copied %ld, placed %d)",
 		    __func__, toep->tid, job, copied, len);
 #endif
 		aio_complete(job, copied + len, 0);
 		t4_rcvd(&toep->td->tod, tp);
 	}
 
 completed:
 	complete_ddp_buffer(toep, db, db_idx);
 	if (toep->ddp.waiting_count > 0)
 		ddp_queue_toep(toep);
 out:
 	DDP_UNLOCK(toep);
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 void
 handle_ddp_indicate(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	MPASS(toep->ddp.active_count == 0);
 	MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
 	if (toep->ddp.waiting_count == 0) {
 		/*
 		 * The pending requests that triggered the request for an
 		 * an indicate were cancelled.  Those cancels should have
 		 * already disabled DDP.  Just ignore this as the data is
 		 * going into the socket buffer anyway.
 		 */
 		return;
 	}
 	CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
 	    toep->tid, toep->ddp.waiting_count);
 	ddp_queue_toep(toep);
 }
 
 enum {
 	DDP_BUF0_INVALIDATED = 0x2,
 	DDP_BUF1_INVALIDATED
 };
 
 CTASSERT(DDP_BUF0_INVALIDATED == CPL_COOKIE_DDP0);
 
 static int
 do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	unsigned int db_idx;
 	struct toepcb *toep;
 	struct inpcb *inp;
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 
 	if (cpl->status != CPL_ERR_NONE)
 		panic("XXX: tcp_rpl failed: %d", cpl->status);
 
 	toep = lookup_tid(sc, tid);
 	inp = toep->inp;
 	switch (cpl->cookie) {
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF0_INVALIDATED):
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF1_INVALIDATED):
 		/*
 		 * XXX: This duplicates a lot of code with handle_ddp_data().
 		 */
 		db_idx = G_COOKIE(cpl->cookie) - DDP_BUF0_INVALIDATED;
 		MPASS(db_idx < nitems(toep->ddp.db));
 		INP_WLOCK(inp);
 		DDP_LOCK(toep);
 		db = &toep->ddp.db[db_idx];
 
 		/*
 		 * handle_ddp_data() should leave the job around until
 		 * this callback runs once a cancel is pending.
 		 */
 		MPASS(db != NULL);
 		MPASS(db->job != NULL);
 		MPASS(db->cancel_pending);
 
 		/*
 		 * XXX: It's not clear what happens if there is data
 		 * placed when the buffer is invalidated.  I suspect we
 		 * need to read the TCB to see how much data was placed.
 		 *
 		 * For now this just pretends like nothing was placed.
 		 *
 		 * XXX: Note that if we did check the PCB we would need to
 		 * also take care of updating the tp, etc.
 		 */
 		job = db->job;
 		copied = job->aio_received;
 		if (copied == 0) {
 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
 			aio_cancel(job);
 		} else {
 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
 			    __func__, job, copied);
 			aio_complete(job, copied, 0);
 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
 		}
 
 		complete_ddp_buffer(toep, db, db_idx);
 		if (toep->ddp.waiting_count > 0)
 			ddp_queue_toep(toep);
 		DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 		break;
 	default:
 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
 	}
 
 	return (0);
 }
 
 void
 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
 {
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 	unsigned int db_flag, db_idx;
 	int len, placed;
 
 	INP_WLOCK_ASSERT(toep->inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		placed = len;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else {
 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
 			    __func__, toep->tid, db_idx, placed);
 			aio_complete(job, copied + placed, 0);
 		}
 		len -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(len == 0);
 	ddp_complete_all(toep, 0);
 }
 
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
 
 extern cpl_handler_t t4_cpl_handler[];
 
 static int
 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	uint32_t vld;
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	vld = be32toh(cpl->ddpvld);
 	if (__predict_false(vld & DDP_ERR)) {
 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
 		    __func__, vld, tid, toep);
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
 		return (0);
 	}
 
 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
 
 	return (0);
 }
 
 static int
 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
 
 	return (0);
 }
 
 static void
 enable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	DDP_ASSERT_LOCKED(toep);
 	toep->ddp.flags |= DDP_SC_REQ;
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
 }
 
 static int
 calculate_hcf(int n1, int n2)
 {
 	int a, b, t;
 
 	if (n1 <= n2) {
 		a = n1;
 		b = n2;
 	} else {
 		a = n2;
 		b = n1;
 	}
 
 	while (a != 0) {
 		t = a;
 		a = b % a;
 		b = t;
 	}
 
 	return (b);
 }
 
 static inline int
 pages_to_nppods(int npages, int ddp_page_shift)
 {
 
 	MPASS(ddp_page_shift >= PAGE_SHIFT);
 
 	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
 }
 
 static int
 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
     struct ppod_reservation *prsv)
 {
 	vmem_addr_t addr;       /* relative to start of region */
 
 	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
 	    &addr) != 0)
 		return (ENOMEM);
 
 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
 
 	/*
 	 * The hardware tagmask includes an extra invalid bit but the arena was
 	 * seeded with valid values only.  An allocation out of this arena will
 	 * fit inside the tagmask but won't have the invalid bit set.
 	 */
 	MPASS((addr & pr->pr_tag_mask) == addr);
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	prsv->prsv_pr = pr;
 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
 	prsv->prsv_nppods = nppods;
 
 	return (0);
 }
 
 int
 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
 {
 	int i, hcf, seglen, idx, nppods;
 	struct ppod_reservation *prsv = &ps->prsv;
 
 	KASSERT(prsv->prsv_nppods == 0,
 	    ("%s: page pods already allocated", __func__));
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
 	 * the page list.
 	 */
 	hcf = 0;
 	for (i = 0; i < ps->npages; i++) {
 		seglen = PAGE_SIZE;
 		while (i < ps->npages - 1 &&
 		    ps->pages[i]->phys_addr + PAGE_SIZE ==
 		    ps->pages[i + 1]->phys_addr) {
 			seglen += PAGE_SIZE;
 			i++;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	nppods = pages_to_nppods(ps->npages, pr->pr_page_shift[idx]);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (0);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (1);
 }
 
 int
 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
     struct ppod_reservation *prsv)
 {
 	int hcf, seglen, idx, npages, nppods;
 	uintptr_t start_pva, end_pva, pva, p1;
 
 	MPASS(buf > 0);
 	MPASS(len > 0);
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
 	 * in the page list.
 	 */
 	hcf = 0;
 	start_pva = trunc_page(buf);
 	end_pva = trunc_page(buf + len - 1);
 	pva = start_pva;
 	while (pva <= end_pva) {
 		seglen = PAGE_SIZE;
 		p1 = pmap_kextract(pva);
 		pva += PAGE_SIZE;
 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
 			seglen += PAGE_SIZE;
 			pva += PAGE_SIZE;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	npages = 1;
 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
 	nppods = howmany(npages, PPOD_PAGES);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (ENOMEM);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (0);
 }
 
 void
 t4_free_page_pods(struct ppod_reservation *prsv)
 {
 	struct ppod_region *pr = prsv->prsv_pr;
 	vmem_addr_t addr;
 
 	MPASS(prsv != NULL);
 	MPASS(prsv->prsv_nppods != 0);
 
 	addr = prsv->prsv_tag & pr->pr_tag_mask;
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
 	    pr->pr_arena, addr, prsv->prsv_nppods);
 
 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
 	prsv->prsv_nppods = 0;
 }
 
 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
 
 int
 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct pageset *ps)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
 	u_int ppod_addr;
 	uint32_t cmd;
 	struct ppod_reservation *prsv = &ps->prsv;
 	struct ppod_region *pr = prsv->prsv_pr;
 
 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
 	    ("%s: page pods already written", __func__));
 	MPASS(prsv->prsv_nppods > 0);
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) | prsv->prsv_tag);
 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
 			    V_PPOD_OFST(ps->offset));
 			ppod->rsvd = 0;
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (idx < ps->npages) {
 					ppod->addr[k] =
 					    htobe64(ps->pages[idx]->phys_addr);
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 	ps->flags |= PS_PPODS_WRITTEN;
 
 	return (0);
 }
 
 int
 t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz;
 	u_int ppod_addr, offset;
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t end_pva, pva, pa;
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	offset = buf & PAGE_MASK;
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	pva = trunc_page(buf);
 	end_pva = trunc_page(buf + buflen - 1);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		MPASS(n > 0);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
 			    V_PPOD_OFST(offset));
 			ppod->rsvd = 0;
 
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (pva > end_pva)
 					ppod->addr[k] = 0;
 				else {
 					pa = pmap_kextract(pva);
 					ppod->addr[k] = htobe64(pa);
 					pva += ddp_pgsz;
 				}
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 			/*
 			 * Walk back 1 segment so that the first address in the
 			 * next pod is the same as the last one in the current
 			 * pod.
 			 */
 			pva -= ddp_pgsz;
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 
 	MPASS(pva <= end_pva);
 
 	return (0);
 }
 
 /*
  * Prepare a pageset for DDP.  This sets up page pods.
  */
 static int
 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
 {
 	struct tom_data *td = sc->tom_softc;
 
 	if (ps->prsv.prsv_nppods == 0 &&
 	    !t4_alloc_page_pods_for_ps(&td->pr, ps)) {
 		return (0);
 	}
 	if (!(ps->flags & PS_PPODS_WRITTEN) &&
 	    t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) {
 		return (0);
 	}
 
 	return (1);
 }
 
 int
 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
     const char *name)
 {
 	int i;
 
 	MPASS(pr != NULL);
 	MPASS(r->size > 0);
 
 	pr->pr_start = r->start;
 	pr->pr_len = r->size;
 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
 
 	/* The SGL -> page pod algorithm requires the sizes to be in order. */
 	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
 		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
 			return (ENXIO);
 	}
 
 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
 	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
 		return (ENXIO);
 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
 
 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
 	    M_FIRSTFIT | M_NOWAIT);
 	if (pr->pr_arena == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 void
 t4_free_ppod_region(struct ppod_region *pr)
 {
 
 	MPASS(pr != NULL);
 
 	if (pr->pr_arena)
 		vmem_destroy(pr->pr_arena);
 	bzero(pr, sizeof(*pr));
 }
 
 static int
 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
     int pgoff, int len)
 {
 
 	if (ps->start != start || ps->npages != npages ||
 	    ps->offset != pgoff || ps->len != len)
 		return (1);
 
 	return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp);
 }
 
 static int
 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
 {
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	struct pageset *ps;
 	int n;
 
 	DDP_ASSERT_LOCKED(toep);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 
 	if (end - start > MAX_DDP_BUFFER_SIZE) {
 		/*
 		 * Truncate the request to a short read.
 		 * Alternatively, we could DDP in chunks to the larger
 		 * buffer, but that would be quite a bit more work.
 		 *
 		 * When truncating, round the request down to avoid
 		 * crossing a cache line on the final transaction.
 		 */
 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
 		    (unsigned long)(end - (start + pgoff)));
 		job->uaiocb.aio_nbytes = end - (start + pgoff);
 #endif
 		end = round_page(end);
 	}
 
 	n = atop(end - start);
 
 	/*
 	 * Try to reuse a cached pageset.
 	 */
 	TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
 		if (pscmp(ps, vm, start, n, pgoff,
 		    job->uaiocb.aio_nbytes) == 0) {
 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 			toep->ddp.cached_count--;
 			*pps = ps;
 			return (0);
 		}
 	}
 
 	/*
 	 * If there are too many cached pagesets to create a new one,
 	 * free a pageset before creating a new one.
 	 */
 	KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
 	    nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
 	if (toep->ddp.active_count + toep->ddp.cached_count ==
 	    nitems(toep->ddp.db)) {
 		KASSERT(toep->ddp.cached_count > 0,
 		    ("no cached pageset to free"));
 		ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count--;
 		free_pageset(toep->td, ps);
 	}
 	DDP_UNLOCK(toep);
 
 	/* Create a new pageset. */
 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	ps->pages = (vm_page_t *)(ps + 1);
 	ps->vm_timestamp = map->timestamp;
 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ps->pages, n);
 
 	DDP_LOCK(toep);
 	if (ps->npages < 0) {
 		free(ps, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
 	    ps->npages, n));
 
 	ps->offset = pgoff;
 	ps->len = job->uaiocb.aio_nbytes;
-	atomic_add_int(&vm->vm_refcnt, 1);
+	refcount_acquire(&vm->vm_refcnt);
 	ps->vm = vm;
 	ps->start = start;
 
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, toep->tid, ps, job, ps->npages);
 	*pps = ps;
 	return (0);
 }
 
 static void
 ddp_complete_all(struct toepcb *toep, int error)
 {
 	struct kaiocb *job;
 
 	DDP_ASSERT_LOCKED(toep);
 	while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
 		job = TAILQ_FIRST(&toep->ddp.aiojobq);
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, error);
 	}
 }
 
 static void
 aio_ddp_cancel_one(struct kaiocb *job)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_cancel(job);
 }
 
 /*
  * Called when the main loop wants to requeue a job to retry it later.
  * Deals with the race of the job being cancelled while it was being
  * examined.
  */
 static void
 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp.flags & DDP_DEAD) &&
 	    aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 		TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count++;
 	} else
 		aio_ddp_cancel_one(job);
 }
 
 static void
 aio_ddp_requeue(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct kaiocb *job;
 	struct ddp_buffer *db;
 	size_t copied, offset, resid;
 	struct pageset *ps;
 	struct mbuf *m;
 	uint64_t ddp_flags, ddp_flags_mask;
 	struct wrqe *wr;
 	int buf_flag, db_idx, error;
 
 	DDP_ASSERT_LOCKED(toep);
 
 restart:
 	if (toep->ddp.flags & DDP_DEAD) {
 		MPASS(toep->ddp.waiting_count == 0);
 		MPASS(toep->ddp.active_count == 0);
 		return;
 	}
 
 	if (toep->ddp.waiting_count == 0 ||
 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
 		return;
 	}
 
 	job = TAILQ_FIRST(&toep->ddp.aiojobq);
 	so = job->fd_file->f_data;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	/* We will never get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		SOCKBUF_UNLOCK(sb);
 		ddp_complete_all(toep, ENOTCONN);
 		return;
 	}
 
 	KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
 	    ("%s: pending sockbuf data and DDP is active", __func__));
 
 	/* Abort if socket has reported problems. */
 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
 	if (so->so_error && sbavail(sb) == 0) {
 		toep->ddp.waiting_count--;
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		if (!aio_clear_cancel_function(job)) {
 			SOCKBUF_UNLOCK(sb);
 			goto restart;
 		}
 
 		/*
 		 * If this job has previously copied some data, report
 		 * a short read and leave the error to be reported by
 		 * a future request.
 		 */
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			aio_complete(job, copied, 0);
 			goto restart;
 		}
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		aio_complete(job, -1, error);
 		goto restart;
 	}
 
 	/*
 	 * Door is closed.  If there is pending data in the socket buffer,
 	 * deliver it.  If there are pending DDP requests, wait for those
 	 * to complete.  Once they have completed, return EOF reads.
 	 */
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ddp.active_count != 0)
 			return;
 		ddp_complete_all(toep, 0);
 		return;
 	}
 
 	/*
 	 * If DDP is not enabled and there is no pending socket buffer
 	 * data, try to enable DDP.
 	 */
 	if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
 		SOCKBUF_UNLOCK(sb);
 
 		/*
 		 * Wait for the card to ACK that DDP is enabled before
 		 * queueing any buffers.  Currently this waits for an
 		 * indicate to arrive.  This could use a TCB_SET_FIELD_RPL
 		 * message to know that DDP was enabled instead of waiting
 		 * for the indicate which would avoid copying the indicate
 		 * if no data is pending.
 		 *
 		 * XXX: Might want to limit the indicate size to the size
 		 * of the first queued request.
 		 */
 		if ((toep->ddp.flags & DDP_SC_REQ) == 0)
 			enable_ddp(sc, toep);
 		return;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	/*
 	 * If another thread is queueing a buffer for DDP, let it
 	 * drain any work and return.
 	 */
 	if (toep->ddp.queueing != NULL)
 		return;
 
 	/* Take the next job to prep it for DDP. */
 	toep->ddp.waiting_count--;
 	TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 	if (!aio_clear_cancel_function(job))
 		goto restart;
 	toep->ddp.queueing = job;
 
 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
 	error = hold_aio(toep, job, &ps);
 	if (error != 0) {
 		ddp_complete_one(job, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	SOCKBUF_LOCK(sb);
 	if (so->so_error && sbavail(sb) == 0) {
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		aio_complete(job, -1, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		if (toep->ddp.active_count != 0) {
 			/*
 			 * The door is closed, but there are still pending
 			 * DDP buffers.  Requeue.  These jobs will all be
 			 * completed once those buffers drain.
 			 */
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			return;
 		}
 		ddp_complete_one(job, 0);
 		ddp_complete_all(toep, 0);
 		toep->ddp.queueing = NULL;
 		return;
 	}
 
 sbcopy:
 	/*
 	 * If the toep is dead, there shouldn't be any data in the socket
 	 * buffer, so the above case should have handled this.
 	 */
 	MPASS(!(toep->ddp.flags & DDP_DEAD));
 
 	/*
 	 * If there is pending data in the socket buffer (either
 	 * from before the requests were queued or a DDP indicate),
 	 * copy those mbufs out directly.
 	 */
 	copied = 0;
 	offset = ps->offset + job->aio_received;
 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
 	resid = job->uaiocb.aio_nbytes - job->aio_received;
 	m = sb->sb_mb;
 	KASSERT(m == NULL || toep->ddp.active_count == 0,
 	    ("%s: sockbuf data with active DDP", __func__));
 	while (m != NULL && resid > 0) {
 		struct iovec iov[1];
 		struct uio uio;
 		int error;
 
 		iov[0].iov_base = mtod(m, void *);
 		iov[0].iov_len = m->m_len;
 		if (iov[0].iov_len > resid)
 			iov[0].iov_len = resid;
 		uio.uio_iov = iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = 0;
 		uio.uio_resid = iov[0].iov_len;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_WRITE;
 		error = uiomove_fromphys(ps->pages, offset + copied,
 		    uio.uio_resid, &uio);
 		MPASS(error == 0 && uio.uio_resid == 0);
 		copied += uio.uio_offset;
 		resid -= uio.uio_offset;
 		m = m->m_next;
 	}
 	if (copied != 0) {
 		sbdrop_locked(sb, copied);
 		job->aio_received += copied;
 		job->msgrcv = 1;
 		copied = job->aio_received;
 		inp = sotoinpcb(so);
 		if (!INP_TRY_WLOCK(inp)) {
 			/*
 			 * The reference on the socket file descriptor in
 			 * the AIO job should keep 'sb' and 'inp' stable.
 			 * Our caller has a reference on the 'toep' that
 			 * keeps it stable.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			DDP_UNLOCK(toep);
 			INP_WLOCK(inp);
 			DDP_LOCK(toep);
 			SOCKBUF_LOCK(sb);
 
 			/*
 			 * If the socket has been closed, we should detect
 			 * that and complete this request if needed on
 			 * the next trip around the loop.
 			 */
 		}
 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
 		INP_WUNLOCK(inp);
 		if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
 			/*
 			 * We filled the entire buffer with socket
 			 * data, DDP is not being used, or the socket
 			 * is being shut down, so complete the
 			 * request.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * If DDP is not enabled, requeue this request and restart.
 		 * This will either enable DDP or wait for more data to
 		 * arrive on the socket buffer.
 		 */
 		if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * An indicate might have arrived and been added to
 		 * the socket buffer while it was unlocked after the
 		 * copy to lock the INP.  If so, restart the copy.
 		 */
 		if (sbavail(sb) != 0)
 			goto sbcopy;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	if (prep_pageset(sc, toep, ps) == 0) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need to retry this later.  Mostly need a trigger
 		 * when page pods are freed up.
 		 */
 		printf("%s: prep_pageset failed\n", __func__);
 		return;
 	}
 
 	/* Determine which DDP buffer to use. */
 	if (toep->ddp.db[0].job == NULL) {
 		db_idx = 0;
 	} else {
 		MPASS(toep->ddp.db[1].job == NULL);
 		db_idx = 1;
 	}
 
 	ddp_flags = 0;
 	ddp_flags_mask = 0;
 	if (db_idx == 0) {
 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
 		buf_flag = DDP_BUF0_ACTIVE;
 	} else {
 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
 		buf_flag = DDP_BUF1_ACTIVE;
 	}
 	MPASS((toep->ddp.flags & buf_flag) == 0);
 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
 		MPASS(db_idx == 0);
 		MPASS(toep->ddp.active_id == -1);
 		MPASS(toep->ddp.active_count == 0);
 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
 	}
 
 	/*
 	 * The TID for this connection should still be valid.  If DDP_DEAD
 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
 	 * this far anyway.  Even if the socket is closing on the other
 	 * end, the AIO job holds a reference on this end of the socket
 	 * which will keep it open and keep the TCP PCB attached until
 	 * after the job is completed.
 	 */
 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received,
 	    ddp_flags, ddp_flags_mask);
 	if (wr == NULL) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need a way to kick a retry here.
 		 *
 		 * XXX: We know the fixed size needed and could
 		 * preallocate this using a blocking request at the
 		 * start of the task to avoid having to handle this
 		 * edge case.
 		 */
 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
 		return;
 	}
 
 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
 		free_wrqe(wr);
 		recycle_pageset(toep, ps);
 		aio_ddp_cancel_one(job);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u, scheduling %p for DDP[%d] (flags %#lx/%#lx)", __func__,
 	    toep->tid, job, db_idx, ddp_flags, ddp_flags_mask);
 #endif
 	/* Give the chip the go-ahead. */
 	t4_wrq_tx(sc, wr);
 	db = &toep->ddp.db[db_idx];
 	db->cancel_pending = 0;
 	db->job = job;
 	db->ps = ps;
 	toep->ddp.queueing = NULL;
 	toep->ddp.flags |= buf_flag;
 	toep->ddp.active_count++;
 	if (toep->ddp.active_count == 1) {
 		MPASS(toep->ddp.active_id == -1);
 		toep->ddp.active_id = db_idx;
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 	}
 	goto restart;
 }
 
 void
 ddp_queue_toep(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (toep->ddp.flags & DDP_TASK_ACTIVE)
 		return;
 	toep->ddp.flags |= DDP_TASK_ACTIVE;
 	hold_toepcb(toep);
 	soaio_enqueue(&toep->ddp.requeue_task);
 }
 
 static void
 aio_ddp_requeue_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 
 	DDP_LOCK(toep);
 	aio_ddp_requeue(toep);
 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
 	DDP_UNLOCK(toep);
 
 	free_toepcb(toep);
 }
 
 static void
 t4_aio_cancel_active(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	uint64_t valid_flag;
 	int i;
 
 	DDP_LOCK(toep);
 	if (aio_cancel_cleared(job)) {
 		DDP_UNLOCK(toep);
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		if (toep->ddp.db[i].job == job) {
 			/* Should only ever get one cancel request for a job. */
 			MPASS(toep->ddp.db[i].cancel_pending == 0);
 
 			/*
 			 * Invalidate this buffer.  It will be
 			 * cancelled or partially completed once the
 			 * card ACKs the invalidate.
 			 */
 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
 			    V_TF_DDP_BUF1_VALID(1);
 			t4_set_tcb_field(sc, toep->ctrlq, toep,
 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
 			    i + DDP_BUF0_INVALIDATED);
 			toep->ddp.db[i].cancel_pending = 1;
 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
 			    __func__, job);
 			break;
 		}
 	}
 	DDP_UNLOCK(toep);
 }
 
 static void
 t4_aio_cancel_queued(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 	DDP_LOCK(toep);
 	if (!aio_cancel_cleared(job)) {
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (toep->ddp.waiting_count == 0)
 			ddp_queue_toep(toep);
 	}
 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
 	DDP_UNLOCK(toep);
 
 	aio_ddp_cancel_one(job);
 }
 
 int
 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 
 
 	/* Ignore writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
 		return (EOPNOTSUPP);
 
 	DDP_LOCK(toep);
 
 	/*
 	 * XXX: Think about possibly returning errors for ENOTCONN,
 	 * etc.  Perhaps the caller would only queue the request
 	 * if it failed with EOPNOTSUPP?
 	 */
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
 #endif
 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
 	toep->ddp.waiting_count++;
 	toep->ddp.flags |= DDP_OK;
 
 	/*
 	 * Try to handle this request synchronously.  If this has
 	 * to block because the task is running, it will just bail
 	 * and let the task handle it instead.
 	 */
 	aio_ddp_requeue(toep);
 	DDP_UNLOCK(toep);
 	return (0);
 }
 
 void
 t4_ddp_mod_load(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 	TAILQ_INIT(&ddp_orphan_pagesets);
 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
 }
 
 void
 t4_ddp_mod_unload(void)
 {
 
 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
 	mtx_destroy(&ddp_orphan_pagesets_lock);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
 }
 #endif
Index: head/sys/kern/init_main.c
===================================================================
--- head/sys/kern/init_main.c	(revision 367333)
+++ head/sys/kern/init_main.c	(revision 367334)
@@ -1,889 +1,889 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1995 Terrence R. Lambert
  * All rights reserved.
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)init_main.c	8.9 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_kdb.h"
 #include "opt_init_path.h"
 #include "opt_verbose_sysinit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/epoch.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/jail.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/loginclass.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/sysent.h>
 #include <sys/reboot.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 #include <sys/unistd.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 #include <sys/cpuset.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/copyright.h>
 
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 
 void mi_startup(void);				/* Should be elsewhere */
 
 /* Components of the first process -- never freed. */
 static struct session session0;
 static struct pgrp pgrp0;
 struct	proc proc0;
 struct thread0_storage thread0_st __aligned(32);
 struct	vmspace vmspace0;
 struct	proc *initproc;
 
 int
 linux_alloc_current_noop(struct thread *td __unused, int flags __unused)
 {
 	return (0);
 }
 int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop;
 
 #ifndef BOOTHOWTO
 #define	BOOTHOWTO	0
 #endif
 int	boothowto = BOOTHOWTO;	/* initialized so that it can be patched */
 SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
 	"Boot control flags, passed from loader");
 
 #ifndef BOOTVERBOSE
 #define	BOOTVERBOSE	0
 #endif
 int	bootverbose = BOOTVERBOSE;
 SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
 	"Control the output of verbose kernel messages");
 
 #ifdef VERBOSE_SYSINIT
 /*
  * We'll use the defined value of VERBOSE_SYSINIT from the kernel config to
  * dictate the default VERBOSE_SYSINIT behavior.  Significant values for this
  * option and associated tunable are:
  * - 0, 'compiled in but silent by default'
  * - 1, 'compiled in but verbose by default' (default)
  */
 int	verbose_sysinit = VERBOSE_SYSINIT;
 TUNABLE_INT("debug.verbose_sysinit", &verbose_sysinit);
 #endif
 
 #ifdef INVARIANTS
 FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance");
 #endif
 
 /*
  * This ensures that there is at least one entry so that the sysinit_set
  * symbol is not undefined.  A sybsystem ID of SI_SUB_DUMMY is never
  * executed.
  */
 SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
 
 /*
  * The sysinit table itself.  Items are checked off as the are run.
  * If we want to register new sysinit types, add them to newsysinit.
  */
 SET_DECLARE(sysinit_set, struct sysinit);
 struct sysinit **sysinit, **sysinit_end;
 struct sysinit **newsysinit, **newsysinit_end;
 
 /*
  * Merge a new sysinit set into the current set, reallocating it if
  * necessary.  This can only be called after malloc is running.
  */
 void
 sysinit_add(struct sysinit **set, struct sysinit **set_end)
 {
 	struct sysinit **newset;
 	struct sysinit **sipp;
 	struct sysinit **xipp;
 	int count;
 
 	count = set_end - set;
 	if (newsysinit)
 		count += newsysinit_end - newsysinit;
 	else
 		count += sysinit_end - sysinit;
 	newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
 	if (newset == NULL)
 		panic("cannot malloc for sysinit");
 	xipp = newset;
 	if (newsysinit)
 		for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
 			*xipp++ = *sipp;
 	else
 		for (sipp = sysinit; sipp < sysinit_end; sipp++)
 			*xipp++ = *sipp;
 	for (sipp = set; sipp < set_end; sipp++)
 		*xipp++ = *sipp;
 	if (newsysinit)
 		free(newsysinit, M_TEMP);
 	newsysinit = newset;
 	newsysinit_end = newset + count;
 }
 
 #if defined (DDB) && defined(VERBOSE_SYSINIT)
 static const char *
 symbol_name(vm_offset_t va, db_strategy_t strategy)
 {
 	const char *name;
 	c_db_sym_t sym;
 	db_expr_t  offset;
 
 	if (va == 0)
 		return (NULL);
 	sym = db_search_symbol(va, strategy, &offset);
 	if (offset != 0)
 		return (NULL);
 	db_symbol_values(sym, &name, NULL);
 	return (name);
 }
 #endif
 
 /*
  * System startup; initialize the world, create process 0, mount root
  * filesystem, and fork to create init and pagedaemon.  Most of the
  * hard work is done in the lower-level initialization routines including
  * startup(), which does memory initialization and autoconfiguration.
  *
  * This allows simple addition of new kernel subsystems that require
  * boot time initialization.  It also allows substitution of subsystem
  * (for instance, a scheduler, kernel profiler, or VM system) by object
  * module.  Finally, it allows for optional "kernel threads".
  */
 void
 mi_startup(void)
 {
 
 	struct sysinit **sipp;	/* system initialization*/
 	struct sysinit **xipp;	/* interior loop of sort*/
 	struct sysinit *save;	/* bubble*/
 
 #if defined(VERBOSE_SYSINIT)
 	int last;
 	int verbose;
 #endif
 
 	TSENTER();
 
 	if (boothowto & RB_VERBOSE)
 		bootverbose++;
 
 	if (sysinit == NULL) {
 		sysinit = SET_BEGIN(sysinit_set);
 		sysinit_end = SET_LIMIT(sysinit_set);
 	}
 
 restart:
 	/*
 	 * Perform a bubble sort of the system initialization objects by
 	 * their subsystem (primary key) and order (secondary key).
 	 */
 	for (sipp = sysinit; sipp < sysinit_end; sipp++) {
 		for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
 			if ((*sipp)->subsystem < (*xipp)->subsystem ||
 			     ((*sipp)->subsystem == (*xipp)->subsystem &&
 			      (*sipp)->order <= (*xipp)->order))
 				continue;	/* skip*/
 			save = *sipp;
 			*sipp = *xipp;
 			*xipp = save;
 		}
 	}
 
 #if defined(VERBOSE_SYSINIT)
 	last = SI_SUB_COPYRIGHT;
 	verbose = 0;
 #if !defined(DDB)
 	printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
 #endif
 #endif
 
 	/*
 	 * Traverse the (now) ordered list of system initialization tasks.
 	 * Perform each task, and continue on to the next task.
 	 */
 	for (sipp = sysinit; sipp < sysinit_end; sipp++) {
 		if ((*sipp)->subsystem == SI_SUB_DUMMY)
 			continue;	/* skip dummy task(s)*/
 
 		if ((*sipp)->subsystem == SI_SUB_DONE)
 			continue;
 
 #if defined(VERBOSE_SYSINIT)
 		if ((*sipp)->subsystem > last && verbose_sysinit != 0) {
 			verbose = 1;
 			last = (*sipp)->subsystem;
 			printf("subsystem %x\n", last);
 		}
 		if (verbose) {
 #if defined(DDB)
 			const char *func, *data;
 
 			func = symbol_name((vm_offset_t)(*sipp)->func,
 			    DB_STGY_PROC);
 			data = symbol_name((vm_offset_t)(*sipp)->udata,
 			    DB_STGY_ANY);
 			if (func != NULL && data != NULL)
 				printf("   %s(&%s)... ", func, data);
 			else if (func != NULL)
 				printf("   %s(%p)... ", func, (*sipp)->udata);
 			else
 #endif
 				printf("   %p(%p)... ", (*sipp)->func,
 				    (*sipp)->udata);
 		}
 #endif
 
 		/* Call function */
 		(*((*sipp)->func))((*sipp)->udata);
 
 #if defined(VERBOSE_SYSINIT)
 		if (verbose)
 			printf("done.\n");
 #endif
 
 		/* Check off the one we're just done */
 		(*sipp)->subsystem = SI_SUB_DONE;
 
 		/* Check if we've installed more sysinit items via KLD */
 		if (newsysinit != NULL) {
 			if (sysinit != SET_BEGIN(sysinit_set))
 				free(sysinit, M_TEMP);
 			sysinit = newsysinit;
 			sysinit_end = newsysinit_end;
 			newsysinit = NULL;
 			newsysinit_end = NULL;
 			goto restart;
 		}
 	}
 
 	TSEXIT();	/* Here so we don't overlap with start_init. */
 
 	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
 	mtx_unlock(&Giant);
 
 	/*
 	 * Now hand over this thread to swapper.
 	 */
 	swapper();
 	/* NOTREACHED*/
 }
 
 static void
 print_caddr_t(void *data)
 {
 	printf("%s", (char *)data);
 }
 
 static void
 print_version(void *data __unused)
 {
 	int len;
 
 	/* Strip a trailing newline from version. */
 	len = strlen(version);
 	while (len > 0 && version[len - 1] == '\n')
 		len--;
 	printf("%.*s %s\n", len, version, machine);
 	printf("%s\n", compiler_version);
 }
 
 SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t,
     copyright);
 SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t,
     trademark);
 SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL);
 
 #ifdef WITNESS
 static char wit_warn[] =
      "WARNING: WITNESS option enabled, expect reduced performance.\n";
 SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_FOURTH,
    print_caddr_t, wit_warn);
 SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_FOURTH,
    print_caddr_t, wit_warn);
 #endif
 
 #ifdef DIAGNOSTIC
 static char diag_warn[] =
      "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
 SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH,
     print_caddr_t, diag_warn);
 SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_FIFTH,
     print_caddr_t, diag_warn);
 #endif
 
 static int
 null_fetch_syscall_args(struct thread *td __unused)
 {
 
 	panic("null_fetch_syscall_args");
 }
 
 static void
 null_set_syscall_retval(struct thread *td __unused, int error __unused)
 {
 
 	panic("null_set_syscall_retval");
 }
 
 struct sysentvec null_sysvec = {
 	.sv_size	= 0,
 	.sv_table	= NULL,
 	.sv_transtrap	= NULL,
 	.sv_fixup	= NULL,
 	.sv_sendsig	= NULL,
 	.sv_sigcode	= NULL,
 	.sv_szsigcode	= NULL,
 	.sv_name	= "null",
 	.sv_coredump	= NULL,
 	.sv_imgact_try	= NULL,
 	.sv_minsigstksz	= 0,
 	.sv_minuser	= VM_MIN_ADDRESS,
 	.sv_maxuser	= VM_MAXUSER_ADDRESS,
 	.sv_usrstack	= USRSTACK,
 	.sv_psstrings	= PS_STRINGS,
 	.sv_stackprot	= VM_PROT_ALL,
 	.sv_copyout_strings	= NULL,
 	.sv_setregs	= NULL,
 	.sv_fixlimit	= NULL,
 	.sv_maxssiz	= NULL,
 	.sv_flags	= 0,
 	.sv_set_syscall_retval = null_set_syscall_retval,
 	.sv_fetch_syscall_args = null_fetch_syscall_args,
 	.sv_syscallnames = NULL,
 	.sv_schedtail	= NULL,
 	.sv_thread_detach = NULL,
 	.sv_trap	= NULL,
 };
 
 /*
  * The two following SYSINIT's are proc0 specific glue code.  I am not
  * convinced that they can not be safely combined, but their order of
  * operation has been maintained as the same as the original init_main.c
  * for right now.
  */
 /* ARGSUSED*/
 static void
 proc0_init(void *dummy __unused)
 {
 	struct proc *p;
 	struct thread *td;
 	struct ucred *newcred;
 	struct uidinfo tmpuinfo;
 	struct loginclass tmplc = {
 		.lc_name = "",
 	};
 	vm_paddr_t pageablemem;
 	int i;
 
 	GIANT_REQUIRED;
 	p = &proc0;
 	td = &thread0;
 
 	/*
 	 * Initialize magic number and osrel.
 	 */
 	p->p_magic = P_MAGIC;
 	p->p_osrel = osreldate;
 
 	/*
 	 * Initialize thread and process structures.
 	 */
 	procinit();	/* set up proc zone */
 	threadinit();	/* set up UMA zones */
 
 	/*
 	 * Initialise scheduler resources.
 	 * Add scheduler specific parts to proc, thread as needed.
 	 */
 	schedinit();	/* scheduler gets its house in order */
 
 	/*
 	 * Create process 0 (the swapper).
 	 */
 	LIST_INSERT_HEAD(&allproc, p, p_list);
 	LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
 	mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 	p->p_pgrp = &pgrp0;
 	LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
 	LIST_INIT(&pgrp0.pg_members);
 	LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
 
 	pgrp0.pg_session = &session0;
 	mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
 	refcount_init(&session0.s_count, 1);
 	session0.s_leader = p;
 
 	p->p_sysent = &null_sysvec;
 	p->p_flag = P_SYSTEM | P_INMEM | P_KPROC;
 	p->p_flag2 = 0;
 	p->p_state = PRS_NORMAL;
 	p->p_klist = knlist_alloc(&p->p_mtx);
 	STAILQ_INIT(&p->p_ktr);
 	p->p_nice = NZERO;
 	/* pid_max cannot be greater than PID_MAX */
 	td->td_tid = PID_MAX + 1;
 	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
 	td->td_state = TDS_RUNNING;
 	td->td_pri_class = PRI_TIMESHARE;
 	td->td_user_pri = PUSER;
 	td->td_base_user_pri = PUSER;
 	td->td_lend_user_pri = PRI_MAX;
 	td->td_priority = PVM;
 	td->td_base_pri = PVM;
 	td->td_oncpu = curcpu;
 	td->td_flags = TDF_INMEM;
 	td->td_pflags = TDP_KTHREAD;
 	td->td_cpuset = cpuset_thread0();
 	td->td_domain.dr_policy = td->td_cpuset->cs_domain;
 	prison0_init();
 	p->p_peers = 0;
 	p->p_leader = p;
 	p->p_reaper = p;
 	p->p_treeflag |= P_TREE_REAPER;
 	LIST_INIT(&p->p_reaplist);
 
 	strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
 	strncpy(td->td_name, "swapper", sizeof (td->td_name));
 
 	callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
 	callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
 	callout_init(&td->td_slpcallout, 1);
 
 	/* Create credentials. */
 	newcred = crget();
 	newcred->cr_ngroups = 1;	/* group 0 */
 	/* A hack to prevent uifind from tripping over NULL pointers. */
 	curthread->td_ucred = newcred;
 	tmpuinfo.ui_uid = 1;
 	newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo;
 	newcred->cr_uidinfo = uifind(0);
 	newcred->cr_ruidinfo = uifind(0);
 	newcred->cr_loginclass = &tmplc;
 	newcred->cr_loginclass = loginclass_find("default");
 	/* End hack. creds get properly set later with thread_cow_get_proc */
 	curthread->td_ucred = NULL;
 	newcred->cr_prison = &prison0;
 	newcred->cr_users++; /* avoid assertion failure */
 	proc_set_cred_init(p, newcred);
 	newcred->cr_users--;
 	crfree(newcred);
 #ifdef AUDIT
 	audit_cred_kproc0(newcred);
 #endif
 #ifdef MAC
 	mac_cred_create_swapper(newcred);
 #endif
 	/* Create sigacts. */
 	p->p_sigacts = sigacts_alloc();
 
 	/* Initialize signal state for process 0. */
 	siginit(&proc0);
 
 	/* Create the file descriptor table. */
 	p->p_fd = fdinit(NULL, false, NULL);
 	p->p_fdtol = NULL;
 
 	/* Create the limits structures. */
 	p->p_limit = lim_alloc();
 	for (i = 0; i < RLIM_NLIMITS; i++)
 		p->p_limit->pl_rlimit[i].rlim_cur =
 		    p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
 	p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
 	p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
 	p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
 	p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
 	p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
 	p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
 	/* Cast to avoid overflow on i386/PAE. */
 	pageablemem = ptoa((vm_paddr_t)vm_free_count());
 	p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
 	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
 	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
 	p->p_cpulimit = RLIM_INFINITY;
 
 	PROC_LOCK(p);
 	thread_cow_get_proc(td, p);
 	PROC_UNLOCK(p);
 
 	/* Initialize resource accounting structures. */
 	racct_create(&p->p_racct);
 
 	p->p_stats = pstats_alloc();
 
 	/* Allocate a prototype map so we have something to fork. */
 	p->p_vmspace = &vmspace0;
-	vmspace0.vm_refcnt = 1;
+	refcount_init(&vmspace0.vm_refcnt, 1);
 	pmap_pinit0(vmspace_pmap(&vmspace0));
 
 	/*
 	 * proc0 is not expected to enter usermode, so there is no special
 	 * handling for sv_minuser here, like is done for exec_new_vmspace().
 	 */
 	vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
 	    p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
 
 	/*
 	 * Call the init and ctor for the new thread and proc.  We wait
 	 * to do this until all other structures are fairly sane.
 	 */
 	EVENTHANDLER_DIRECT_INVOKE(process_init, p);
 	EVENTHANDLER_DIRECT_INVOKE(thread_init, td);
 	EVENTHANDLER_DIRECT_INVOKE(process_ctor, p);
 	EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td);
 
 	/*
 	 * Charge root for one process.
 	 */
 	(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
 	PROC_LOCK(p);
 	racct_add_force(p, RACCT_NPROC, 1);
 	PROC_UNLOCK(p);
 }
 SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
 
 /* ARGSUSED*/
 static void
 proc0_post(void *dummy __unused)
 {
 	struct proc *p;
 	struct rusage ru;
 	struct thread *td;
 
 	/*
 	 * Now we can look at the time, having had a chance to verify the
 	 * time from the filesystem.  Pretend that proc0 started now.
 	 */
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		microuptime(&p->p_stats->p_start);
 		PROC_STATLOCK(p);
 		rufetch(p, &ru);	/* Clears thread stats */
 		p->p_rux.rux_runtime = 0;
 		p->p_rux.rux_uticks = 0;
 		p->p_rux.rux_sticks = 0;
 		p->p_rux.rux_iticks = 0;
 		PROC_STATUNLOCK(p);
 		FOREACH_THREAD_IN_PROC(p, td) {
 			td->td_runtime = 0;
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 	PCPU_SET(switchtime, cpu_ticks());
 	PCPU_SET(switchticks, ticks);
 }
 SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);
 
 /*
  ***************************************************************************
  ****
  **** The following SYSINIT's and glue code should be moved to the
  **** respective files on a per subsystem basis.
  ****
  ***************************************************************************
  */
 
 /*
  * List of paths to try when searching for "init".
  */
 static char init_path[MAXPATHLEN] =
 #ifdef	INIT_PATH
     __XSTRING(INIT_PATH);
 #else
     "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init";
 #endif
 SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
 	"Path used to search the init process");
 
 /*
  * Shutdown timeout of init(8).
  * Unused within kernel, but used to control init(8), hence do not remove.
  */
 #ifndef INIT_SHUTDOWN_TIMEOUT
 #define INIT_SHUTDOWN_TIMEOUT 120
 #endif
 static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT;
 SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout,
 	CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). "
 	"Unused within kernel, but used to control init(8)");
 
 /*
  * Start the initial user process; try exec'ing each pathname in init_path.
  * The program is invoked with one argument containing the boot flags.
  */
 static void
 start_init(void *dummy)
 {
 	struct image_args args;
 	int error;
 	char *var, *path;
 	char *free_init_path, *tmp_init_path;
 	struct thread *td;
 	struct proc *p;
 	struct vmspace *oldvmspace;
 
 	TSENTER();	/* Here so we don't overlap with mi_startup. */
 
 	td = curthread;
 	p = td->td_proc;
 
 	vfs_mountroot();
 
 	/* Wipe GELI passphrase from the environment. */
 	kern_unsetenv("kern.geom.eli.passphrase");
 
 	/* For Multicons, report which console is primary to both */
 	if (boothowto & RB_MULTIPLE) {
 		if (boothowto & RB_SERIAL)
 			printf("Dual Console: Serial Primary, Video Secondary\n");
 		else
 			printf("Dual Console: Video Primary, Serial Secondary\n");
 	}
 
 	if ((var = kern_getenv("init_path")) != NULL) {
 		strlcpy(init_path, var, sizeof(init_path));
 		freeenv(var);
 	}
 	free_init_path = tmp_init_path = strdup(init_path, M_TEMP);
 
 	while ((path = strsep(&tmp_init_path, ":")) != NULL) {
 		if (bootverbose)
 			printf("start_init: trying %s\n", path);
 
 		memset(&args, 0, sizeof(args));
 		error = exec_alloc_args(&args);
 		if (error != 0)
 			panic("%s: Can't allocate space for init arguments %d",
 			    __func__, error);
 
 		error = exec_args_add_fname(&args, path, UIO_SYSSPACE);
 		if (error != 0)
 			panic("%s: Can't add fname %d", __func__, error);
 		error = exec_args_add_arg(&args, path, UIO_SYSSPACE);
 		if (error != 0)
 			panic("%s: Can't add argv[0] %d", __func__, error);
 		if (boothowto & RB_SINGLE)
 			error = exec_args_add_arg(&args, "-s", UIO_SYSSPACE);
 		if (error != 0)
 			panic("%s: Can't add argv[0] %d", __func__, error);
 
 		/*
 		 * Now try to exec the program.  If can't for any reason
 		 * other than it doesn't exist, complain.
 		 *
 		 * Otherwise, return via fork_trampoline() all the way
 		 * to user mode as init!
 		 */
 		KASSERT((td->td_pflags & TDP_EXECVMSPC) == 0,
 		    ("nested execve"));
 		oldvmspace = td->td_proc->p_vmspace;
 		error = kern_execve(td, &args, NULL, oldvmspace);
 		KASSERT(error != 0,
 		    ("kern_execve returned success, not EJUSTRETURN"));
 		if (error == EJUSTRETURN) {
 			exec_cleanup(td, oldvmspace);
 			free(free_init_path, M_TEMP);
 			TSEXIT();
 			return;
 		}
 		if (error != ENOENT)
 			printf("exec %s: error %d\n", path, error);
 	}
 	free(free_init_path, M_TEMP);
 	printf("init: not found in path %s\n", init_path);
 	panic("no init");
 }
 
 /*
  * Like kproc_create(), but runs in its own address space.  We do this
  * early to reserve pid 1.  Note special case - do not make it
  * runnable yet, init execution is started when userspace can be served.
  */
 static void
 create_init(const void *udata __unused)
 {
 	struct fork_req fr;
 	struct ucred *newcred, *oldcred;
 	struct thread *td;
 	int error;
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
 	fr.fr_procp = &initproc;
 	error = fork1(&thread0, &fr);
 	if (error)
 		panic("cannot fork init: %d\n", error);
 	KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
 	/* divorce init's credentials from the kernel's */
 	newcred = crget();
 	sx_xlock(&proctree_lock);
 	PROC_LOCK(initproc);
 	initproc->p_flag |= P_SYSTEM | P_INMEM;
 	initproc->p_treeflag |= P_TREE_REAPER;
 	oldcred = initproc->p_ucred;
 	crcopy(newcred, oldcred);
 #ifdef MAC
 	mac_cred_create_init(newcred);
 #endif
 #ifdef AUDIT
 	audit_cred_proc1(newcred);
 #endif
 	proc_set_cred(initproc, newcred);
 	td = FIRST_THREAD_IN_PROC(initproc);
 	crcowfree(td);
 	td->td_realucred = crcowget(initproc->p_ucred);
 	td->td_ucred = td->td_realucred;
 	PROC_UNLOCK(initproc);
 	sx_xunlock(&proctree_lock);
 	crfree(oldcred);
 	cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc),
 	    start_init, NULL);
 }
 SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
 
 /*
  * Make it runnable now.
  */
 static void
 kick_init(const void *udata __unused)
 {
 	struct thread *td;
 
 	td = FIRST_THREAD_IN_PROC(initproc);
 	thread_lock(td);
 	TD_SET_CAN_RUN(td);
 	sched_add(td, SRQ_BORING);
 }
 SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
 
 /*
  * DDB(4).
  */
 #ifdef DDB
 static void
 db_show_print_syinit(struct sysinit *sip, bool ddb)
 {
 	const char *sname, *funcname;
 	c_db_sym_t sym;
 	db_expr_t  offset;
 
 #define xprint(...)							\
 	if (ddb)							\
 		db_printf(__VA_ARGS__);					\
 	else								\
 		printf(__VA_ARGS__)
 
 	if (sip == NULL) {
 		xprint("%s: no sysinit * given\n", __func__);
 		return;
 	}
 
 	sym = db_search_symbol((vm_offset_t)sip, DB_STGY_ANY, &offset);
 	db_symbol_values(sym, &sname, NULL);
 	sym = db_search_symbol((vm_offset_t)sip->func, DB_STGY_PROC, &offset);
 	db_symbol_values(sym, &funcname, NULL);
 	xprint("%s(%p)\n", (sname != NULL) ? sname : "", sip);
 	xprint("  %#08x %#08x\n", sip->subsystem, sip->order);
 	xprint("  %p(%s)(%p)\n",
 	    sip->func, (funcname != NULL) ? funcname : "", sip->udata);
 #undef xprint
 }
 
 DB_SHOW_COMMAND(sysinit, db_show_sysinit)
 {
 	struct sysinit **sipp;
 
 	db_printf("SYSINIT vs Name(Ptr)\n");
 	db_printf("  Subsystem  Order\n");
 	db_printf("  Function(Name)(Arg)\n");
 	for (sipp = sysinit; sipp < sysinit_end; sipp++) {
 		db_show_print_syinit(*sipp, true);
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif /* DDB */
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c	(revision 367333)
+++ head/sys/kern/kern_exec.c	(revision 367334)
@@ -1,1841 +1,1842 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/acct.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 dtrace_execexit_func_t	dtrace_fasttrap_exec;
 #endif
 
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE1(proc, , , exec, "char *");
 SDT_PROBE_DEFINE1(proc, , , exec__failure, "int");
 SDT_PROBE_DEFINE1(proc, , , exec__success, "char *");
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 int coredump_pack_fileinfo = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN,
     &coredump_pack_fileinfo, 0,
     "Enable file path packing in 'procstat -f' coredump notes");
 
 int coredump_pack_vmmapinfo = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN,
     &coredump_pack_vmmapinfo, 0,
     "Enable file path packing in 'procstat -v' coredump notes");
 
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p, struct vmspace *oldvmspace);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD|
     CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU",
     "Location of process' ps_strings structure");
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
     CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU",
     "Top of process stack");
 
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE,
     NULL, 0, sysctl_kern_stackprot, "I",
     "Stack memory permissions");
 
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0,
     "Process' command line characters cache limit");
 
 static int disallow_high_osrel;
 SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
     &disallow_high_osrel, 0,
     "Disallow execution of binaries built for higher version of the world");
 
 static int map_at_zero = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0,
     "Permit processes to map an object at virtual address 0.");
 
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_psstrings;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 		   sizeof(p->p_sysent->sv_psstrings));
 	return error;
 }
 
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_usrstack;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 		    sizeof(p->p_sysent->sv_usrstack));
 	return error;
 }
 
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 
 	p = curproc;
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot)));
 }
 
 /*
  * Each of the items is a pointer to a `const struct execsw', hence the
  * double pointer here.
  */
 static const struct execsw **execsw;
 
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	char    *fname; 
 	char    **argv;
 	char    **envv; 
 };
 #endif
 
 int
 sys_execve(struct thread *td, struct execve_args *uap)
 {
 	struct image_args args;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, NULL, oldvmspace);
 	post_execve(td, error, oldvmspace);
 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fexecve_args {
 	int	fd;
 	char	**argv;
 	char	**envv;
 };
 #endif
 int
 sys_fexecve(struct thread *td, struct fexecve_args *uap)
 {
 	struct image_args args;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
 		args.fd = uap->fd;
 		error = kern_execve(td, &args, NULL, oldvmspace);
 	}
 	post_execve(td, error, oldvmspace);
 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 	struct mac	*mac_p;
 };
 #endif
 
 int
 sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
 {
 #ifdef MAC
 	struct image_args args;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, uap->mac_p, oldvmspace);
 	post_execve(td, error, oldvmspace);
 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
 	return (error);
 #else
 	return (ENOSYS);
 #endif
 }
 
 int
 pre_execve(struct thread *td, struct vmspace **oldvmspace)
 {
 	struct proc *p;
 	int error;
 
 	KASSERT(td == curthread, ("non-current thread %p", td));
 	error = 0;
 	p = td->td_proc;
 	if ((p->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(p);
 		if (thread_single(p, SINGLE_BOUNDARY) != 0)
 			error = ERESTART;
 		PROC_UNLOCK(p);
 	}
 	KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("nested execve"));
 	*oldvmspace = p->p_vmspace;
 	return (error);
 }
 
 void
 post_execve(struct thread *td, int error, struct vmspace *oldvmspace)
 {
 	struct proc *p;
 
 	KASSERT(td == curthread, ("non-current thread %p", td));
 	p = td->td_proc;
 	if ((p->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(p);
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
 		if (error == EJUSTRETURN)
 			thread_single(p, SINGLE_EXIT);
 		else
 			thread_single_end(p, SINGLE_BOUNDARY);
 		PROC_UNLOCK(p);
 	}
 	exec_cleanup(td, oldvmspace);
 }
 
 /*
  * kern_execve() has the astonishing property of not always returning to
  * the caller.  If sufficiently bad things happen during the call to
  * do_execve(), it can end up calling exit1(); as a result, callers must
  * avoid doing anything which they might need to undo (e.g., allocating
  * memory).
  */
 int
 kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p,
     struct vmspace *oldvmspace)
 {
 
 	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
 	    exec_args_get_begin_envv(args) - args->begin_argv);
 	AUDIT_ARG_ENVV(exec_args_get_begin_envv(args), args->envc,
 	    args->endp - exec_args_get_begin_envv(args));
 	return (do_execve(td, args, mac_p, oldvmspace));
 }
 
 /*
  * In-kernel implementation of execve().  All arguments are assumed to be
  * userspace pointers from the passed thread.
  */
 static int
 do_execve(struct thread *td, struct image_args *args, struct mac *mac_p,
     struct vmspace *oldvmspace)
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd;
 	struct ucred *oldcred;
 	struct uidinfo *euip = NULL;
 	uintptr_t stack_base;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *oldtextvp = NULL, *newtextvp;
 	int credential_changing;
 #ifdef MAC
 	struct label *interpvplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 	int error, i, orig_osrel;
 	uint32_t orig_fctl0;
 	static const char fexecv_proc_title[] = "(fexecv)";
 
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	bzero(imgp, sizeof(*imgp));
 	imgp->proc = p;
 	imgp->attr = &attr;
 	imgp->args = args;
 	oldcred = p->p_ucred;
 	orig_osrel = p->p_osrel;
 	orig_fctl0 = p->p_fctl0;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	if (args->fname != NULL) {
 		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW |
 		    SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 	}
 
 	SDT_PROBE1(proc, , , exec, args->fname);
 
 interpret:
 	if (args->fname != NULL) {
 #ifdef CAPABILITY_MODE
 		/*
 		 * While capability mode can't reach this point via direct
 		 * path arguments to execve(), we also don't allow
 		 * interpreters to be used in capability mode (for now).
 		 * Catch indirect lookups and return a permissions error.
 		 */
 		if (IN_CAPABILITY_MODE(td)) {
 			error = ECAPMODE;
 			goto exec_fail;
 		}
 #endif
 		error = namei(&nd);
 		if (error)
 			goto exec_fail;
 
 		newtextvp = nd.ni_vp;
 		imgp->vp = newtextvp;
 	} else {
 		AUDIT_ARG_FD(args->fd);
 		/*
 		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
 		error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp);
 		if (error)
 			goto exec_fail;
 		vn_lock(newtextvp, LK_SHARED | LK_RETRY);
 		AUDIT_ARG_VNODE1(newtextvp);
 		imgp->vp = newtextvp;
 	}
 
 	/*
 	 * Check file permissions.  Also 'opens' file and sets its vnode to
 	 * text mode.
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->proc->p_osrel = 0;
 	imgp->proc->p_fctl0 = 0;
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Determine new credentials before attempting image activators
 	 * so that it can be used by process_exec handlers to determine
 	 * credential/setid changes.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * We disable setuid/setgid/etc in capability mode on the basis
 	 * that most setugid applications are not written with that
 	 * environment in mind, and will therefore almost certainly operate
 	 * incorrectly. In principle there's no reason that setugid
 	 * applications might not be useful in capability mode, so we may want
 	 * to reconsider this conservative design choice in the future.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & S_ISUID) &&
 	    oldcred->cr_uid != attr.va_uid;
 	credential_changing |= (attr.va_mode & S_ISGID) &&
 	    oldcred->cr_gid != attr.va_gid;
 #ifdef MAC
 	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
 	    interpvplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	/* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */
 	if (credential_changing)
 		imgp->proc->p_pdeathsig = 0;
 
 	if (credential_changing &&
 #ifdef CAPABILITY_MODE
 	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
 #endif
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		imgp->credential_setid = true;
 		VOP_UNLOCK(imgp->vp);
 		imgp->newcred = crdup(oldcred);
 		if (attr.va_mode & S_ISUID) {
 			euip = uifind(attr.va_uid);
 			change_euid(imgp->newcred, euip);
 		}
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		if (attr.va_mode & S_ISGID)
 			change_egid(imgp->newcred, attr.va_gid);
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(imgp->newcred, imgp->newcred->cr_uid);
 		change_svgid(imgp->newcred, imgp->newcred->cr_gid);
 	} else {
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			VOP_UNLOCK(imgp->vp);
 			imgp->newcred = crdup(oldcred);
 			vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 			change_svuid(imgp->newcred, imgp->newcred->cr_uid);
 			change_svgid(imgp->newcred, imgp->newcred->cr_gid);
 		}
 	}
 	/* The new credentials are installed into the process later. */
 
 	/*
 	 * Do the best to calculate the full path to the image file.
 	 */
 	if (args->fname != NULL && args->fname[0] == '/')
 		imgp->execpath = args->fname;
 	else {
 		VOP_UNLOCK(imgp->vp);
 		if (vn_fullpath(imgp->vp, &imgp->execpath, &imgp->freepath) != 0)
 			imgp->execpath = args->fname;
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	}
 
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		if (error == -1)
 			error = ENOEXEC;
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * The text reference needs to be removed for scripts.
 		 * There is a short period before we determine that
 		 * something is a script where text reference is active.
 		 * The vnode lock is held over this entire period
 		 * so nothing should illegitimately be blocked.
 		 */
 		MPASS(imgp->textset);
 		VOP_UNSET_TEXT_CHECKED(newtextvp);
 		imgp->textset = false;
 		/* free name buffer and old vnode */
 		if (args->fname != NULL)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
 		mac_execve_interpreter_enter(newtextvp, &interpvplabel);
 #endif
 		if (imgp->opened) {
 			VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
 			imgp->opened = 0;
 		}
 		vput(newtextvp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		imgp->credential_setid = false;
 		if (imgp->newcred != NULL) {
 			crfree(imgp->newcred);
 			imgp->newcred = NULL;
 		}
 		imgp->execpath = NULL;
 		free(imgp->freepath, M_TEMP);
 		imgp->freepath = NULL;
 		/* set new name to that of the interpreter */
 		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW |
 		    SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td);
 		args->fname = imgp->interpreter_name;
 		goto interpret;
 	}
 
 	/*
 	 * NB: We unlock the vnode here because it is believed that none
 	 * of the sv_copyout_strings/sv_fixup operations require the vnode.
 	 */
 	VOP_UNLOCK(imgp->vp);
 
 	if (disallow_high_osrel &&
 	    P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
 		error = ENOEXEC;
 		uprintf("Osrel %d for image %s too high\n", p->p_osrel,
 		    imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		goto exec_fail_dealloc;
 	}
 
 	/* ABI enforces the use of Capsicum. Switch into capabilities mode. */
 	if (SV_PROC_FLAG(p, SV_CAPSICUM))
 		sys_cap_enter(td, NULL);
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base.
 	 */
 	error = (*p->p_sysent->sv_copyout_strings)(imgp, &stack_base);
 	if (error != 0) {
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Stack setup.
 	 */
 	error = (*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	if (error != 0) {
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		goto exec_fail_dealloc;
 	}
 
 	if (args->fdp != NULL) {
 		/* Install a brand new file descriptor table. */
 		fdinstall_remapped(td, args->fdp);
 		args->fdp = NULL;
 	} else {
 		/*
 		 * Keep on using the existing file descriptor table. For
 		 * security and other reasons, the file descriptor table
 		 * cannot be shared after an exec.
 		 */
 		fdunshare(td);
 		/* close files on exec */
 		fdcloseexec(td);
 	}
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 	}
 
 	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 
 	PROC_LOCK(p);
 	if (oldsigacts)
 		p->p_sigacts = newsigacts;
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	bzero(p->p_comm, sizeof(p->p_comm));
 	if (args->fname)
 		bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
 		    min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
 	else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
 		bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
 	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
 #ifdef KTR
 	sched_clear_tdname(td);
 #endif
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
 		p->p_flag2 &= ~P2_NOTRACE;
 	if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0)
 		p->p_flag2 &= ~P2_STKGAP_DISABLE;
 	if (p->p_flag & P_PPWAIT) {
 		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
 		cv_broadcast(&p->p_pwait);
 		/* STOPs are no longer ignored, arrange for AST */
 		signotify(td);
 	}
 
 	/*
 	 * Implement image setuid/setgid installation.
 	 */
 	if (imgp->credential_setid) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 
 #ifdef KTRACE
 		if (p->p_tracecred != NULL &&
 		    priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED))
 			ktrprocexec(p, &tracecred, &tracevp);
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * Both fdsetugidsafety() and fdcheckstd() may call functions
 		 * taking sleepable locks, so temporarily drop our locks.
 		 */
 		PROC_UNLOCK(p);
 		VOP_UNLOCK(imgp->vp);
 		fdsetugidsafety(td);
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		if (error != 0)
 			goto exec_fail_dealloc;
 		PROC_LOCK(p);
 #ifdef MAC
 		if (will_transition) {
 			mac_vnode_execve_transition(oldcred, imgp->newcred,
 			    imgp->vp, interpvplabel, imgp);
 		}
 #endif
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 	}
 	/*
 	 * Set the new credentials.
 	 */
 	if (imgp->newcred != NULL) {
 		proc_set_cred(p, imgp->newcred);
 		crfree(oldcred);
 		oldcred = NULL;
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced by namei
 	 * or fgetvp_exec.
 	 */
 	oldtextvp = p->p_textvp;
 	p->p_textvp = newtextvp;
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Tell the DTrace fasttrap provider about the exec if it
 	 * has declared an interest.
 	 */
 	if (dtrace_fasttrap_exec)
 		dtrace_fasttrap_exec(p);
 #endif
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 	PROC_UNLOCK(p);
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		VOP_UNLOCK(imgp->vp);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 	}
 #endif
 
 	/* Set values passed into the program in registers. */
 	(*p->p_sysent->sv_setregs)(td, imgp, stack_base);
 
 	VOP_MMAPPED(imgp->vp);
 
 	SDT_PROBE1(proc, , , exec__success, args->fname);
 
 exec_fail_dealloc:
 	if (error != 0) {
 		p->p_osrel = orig_osrel;
 		p->p_fctl0 = orig_fctl0;
 	}
 
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		if (args->fname)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (imgp->opened)
 			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
 		if (imgp->textset)
 			VOP_UNSET_TEXT_CHECKED(imgp->vp);
 		if (error != 0)
 			vput(imgp->vp);
 		else
 			VOP_UNLOCK(imgp->vp);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	free(imgp->freepath, M_TEMP);
 
 	if (error == 0) {
 		if (p->p_ptevents & PTRACE_EXEC) {
 			PROC_LOCK(p);
 			if (p->p_ptevents & PTRACE_EXEC)
 				td->td_dbgflags |= TDB_EXEC;
 			PROC_UNLOCK(p);
 		}
 	} else {
 exec_fail:
 		/* we're done here, clear P_INEXEC */
 		PROC_LOCK(p);
 		p->p_flag &= ~P_INEXEC;
 		PROC_UNLOCK(p);
 
 		SDT_PROBE1(proc, , , exec__failure, error);
 	}
 
 	if (imgp->newcred != NULL && oldcred != NULL)
 		crfree(imgp->newcred);
 
 #ifdef MAC
 	mac_execve_exit(imgp);
 	mac_execve_interpreter_exit(interpvplabel);
 #endif
 	exec_free_args(args);
 
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (oldtextvp != NULL)
 		vrele(oldtextvp);
 #ifdef KTRACE
 	if (tracevp != NULL)
 		vrele(tracevp);
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	pargs_drop(oldargs);
 	pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 	if (euip != NULL)
 		uifree(euip);
 
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 		exec_cleanup(td, oldvmspace);
 		exit1(td, 0, SIGABRT);
 		/* NOT REACHED */
 	}
 
 #ifdef KTRACE
 	if (error == 0)
 		ktrprocctor(p);
 #endif
 
 	/*
 	 * We don't want cpu_set_syscall_retval() to overwrite any of
 	 * the register values put in place by exec_setregs().
 	 * Implementations of cpu_set_syscall_retval() will leave
 	 * registers unmodified when returning EJUSTRETURN.
 	 */
 	return (error == 0 ? EJUSTRETURN : error);
 }
 
 void
 exec_cleanup(struct thread *td, struct vmspace *oldvmspace)
 {
 	if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
 		KASSERT(td->td_proc->p_vmspace != oldvmspace,
 		    ("oldvmspace still used"));
 		vmspace_free(oldvmspace);
 		td->td_pflags &= ~TDP_EXECVMSPC;
 	}
 }
 
 int
 exec_map_first_page(struct image_params *imgp)
 {
 	vm_object_t object;
 	vm_page_t m;
 	int error;
 
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 #if VM_NRESERVLEVEL > 0
 	if ((object->flags & OBJ_COLORED) == 0) {
 		VM_OBJECT_WLOCK(object);
 		vm_object_color(object, 0);
 		VM_OBJECT_WUNLOCK(object);
 	}
 #endif
 	error = vm_page_grab_valid_unlocked(&m, object, 0,
 	    VM_ALLOC_COUNT(VM_INITIAL_PAGEIN) |
             VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
 
 	if (error != VM_PAGER_OK)
 		return (EIO);
 	imgp->firstpage = sf_buf_alloc(m, 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 void
 exec_unmap_first_page(struct image_params *imgp)
 {
 	vm_page_t m;
 
 	if (imgp->firstpage != NULL) {
 		m = sf_buf_page(imgp->firstpage);
 		sf_buf_free(imgp->firstpage);
 		imgp->firstpage = NULL;
 		vm_page_unwire(m, PQ_ACTIVE);
 	}
 }
 
 /*
  * Destroy old address space, and allocate a new stack.
  *	The new stack is only sgrowsiz large because it is grown
  *	automatically on a page fault.
  */
 int
 exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	struct thread *td = curthread;
 	vm_object_t obj;
 	struct rlimit rlim_stack;
 	vm_offset_t sv_minuser, stack_addr;
 	vm_map_t map;
 	u_long ssiz;
 
 	imgp->vmspace_destroyed = 1;
 	imgp->sysent = sv;
 
 	sigfastblock_clear(td);
 
 	/* May be called with Giant held */
 	EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	if (map_at_zero)
 		sv_minuser = sv->sv_minuser;
 	else
 		sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
-	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
+	if (refcount_load(&vmspace->vm_refcnt) == 1 &&
+	    vm_map_min(map) == sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser &&
 	    cpu_exec_vmspace_reuse(p, map)) {
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 		/*
 		 * An exec terminates mlockall(MCL_FUTURE), ASLR state
 		 * must be re-evaluated.
 		 */
 		vm_map_lock(map);
 		vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR |
 		    MAP_ASLR_IGNSTART);
 		vm_map_unlock(map);
 	} else {
 		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
 		if (error)
 			return (error);
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 	map->flags |= imgp->map_flags;
 
 	/* Map a shared page */
 	obj = sv->sv_shared_page_obj;
 	if (obj != NULL) {
 		vm_object_reference(obj);
 		error = vm_map_fixed(map, obj, 0,
 		    sv->sv_shared_page_base, sv->sv_shared_page_len,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    VM_PROT_READ | VM_PROT_EXECUTE,
 		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
 		if (error != KERN_SUCCESS) {
 			vm_object_deallocate(obj);
 			return (vm_mmap_to_errno(error));
 		}
 	}
 
 	/* Allocate a new stack */
 	if (imgp->stack_sz != 0) {
 		ssiz = trunc_page(imgp->stack_sz);
 		PROC_LOCK(p);
 		lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
 		PROC_UNLOCK(p);
 		if (ssiz > rlim_stack.rlim_max)
 			ssiz = rlim_stack.rlim_max;
 		if (ssiz > rlim_stack.rlim_cur) {
 			rlim_stack.rlim_cur = ssiz;
 			kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
 		}
 	} else if (sv->sv_maxssiz != NULL) {
 		ssiz = *sv->sv_maxssiz;
 	} else {
 		ssiz = maxssiz;
 	}
 	imgp->eff_stack_sz = lim_cur(curthread, RLIMIT_STACK);
 	if (ssiz < imgp->eff_stack_sz)
 		imgp->eff_stack_sz = ssiz;
 	stack_addr = sv->sv_usrstack - ssiz;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error != KERN_SUCCESS)
 		return (vm_mmap_to_errno(error));
 
 	/*
 	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
 	 * are still used to enforce the stack rlimit on the process stack.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)stack_addr;
 
 	return (0);
 }
 
 /*
  * Copy out argument and environment strings from the old process address
  * space into the temporary string buffer.
  */
 int
 exec_copyin_args(struct image_args *args, const char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	u_long arg, env;
 	int error;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 
 	/*
 	 * Allocate demand-paged memory for the file name, argument, and
 	 * environment strings.
 	 */
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Copy the file name.
 	 */
 	error = exec_args_add_fname(args, fname, segflg);
 	if (error != 0)
 		goto err_exit;
 
 	/*
 	 * extract arguments first
 	 */
 	for (;;) {
 		error = fueword(argv++, &arg);
 		if (error == -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
 		if (arg == 0)
 			break;
 		error = exec_args_add_arg(args, (char *)(uintptr_t)arg,
 		    UIO_USERSPACE);
 		if (error != 0)
 			goto err_exit;
 	}
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		for (;;) {
 			error = fueword(envv++, &env);
 			if (error == -1) {
 				error = EFAULT;
 				goto err_exit;
 			}
 			if (env == 0)
 				break;
 			error = exec_args_add_env(args,
 			    (char *)(uintptr_t)env, UIO_USERSPACE);
 			if (error != 0)
 				goto err_exit;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 int
 exec_copyin_data_fds(struct thread *td, struct image_args *args,
     const void *data, size_t datalen, const int *fds, size_t fdslen)
 {
 	struct filedesc *ofdp;
 	const char *p;
 	int *kfds;
 	int error;
 
 	memset(args, '\0', sizeof(*args));
 	ofdp = td->td_proc->p_fd;
 	if (datalen >= ARG_MAX || fdslen >= ofdp->fd_nfiles)
 		return (E2BIG);
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	args->begin_argv = args->buf;
 	args->stringspace = ARG_MAX;
 
 	if (datalen > 0) {
 		/*
 		 * Argument buffer has been provided. Copy it into the
 		 * kernel as a single string and add a terminating null
 		 * byte.
 		 */
 		error = copyin(data, args->begin_argv, datalen);
 		if (error != 0)
 			goto err_exit;
 		args->begin_argv[datalen] = '\0';
 		args->endp = args->begin_argv + datalen + 1;
 		args->stringspace -= datalen + 1;
 
 		/*
 		 * Traditional argument counting. Count the number of
 		 * null bytes.
 		 */
 		for (p = args->begin_argv; p < args->endp; ++p)
 			if (*p == '\0')
 				++args->argc;
 	} else {
 		/* No argument buffer provided. */
 		args->endp = args->begin_argv;
 	}
 
 	/* Create new file descriptor table. */
 	kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK);
 	error = copyin(fds, kfds, fdslen * sizeof(int));
 	if (error != 0) {
 		free(kfds, M_TEMP);
 		goto err_exit;
 	}
 	error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp);
 	free(kfds, M_TEMP);
 	if (error != 0)
 		goto err_exit;
 
 	return (0);
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 struct exec_args_kva {
 	vm_offset_t addr;
 	u_int gen;
 	SLIST_ENTRY(exec_args_kva) next;
 };
 
 DPCPU_DEFINE_STATIC(struct exec_args_kva *, exec_args_kva);
 
 static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist;
 static struct mtx exec_args_kva_mtx;
 static u_int exec_args_gen;
 
 static void
 exec_prealloc_args_kva(void *arg __unused)
 {
 	struct exec_args_kva *argkva;
 	u_int i;
 
 	SLIST_INIT(&exec_args_kva_freelist);
 	mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF);
 	for (i = 0; i < exec_map_entries; i++) {
 		argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK);
 		argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size);
 		argkva->gen = exec_args_gen;
 		SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next);
 	}
 }
 SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL);
 
 static vm_offset_t
 exec_alloc_args_kva(void **cookie)
 {
 	struct exec_args_kva *argkva;
 
 	argkva = (void *)atomic_readandclear_ptr(
 	    (uintptr_t *)DPCPU_PTR(exec_args_kva));
 	if (argkva == NULL) {
 		mtx_lock(&exec_args_kva_mtx);
 		while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL)
 			(void)mtx_sleep(&exec_args_kva_freelist,
 			    &exec_args_kva_mtx, 0, "execkva", 0);
 		SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next);
 		mtx_unlock(&exec_args_kva_mtx);
 	}
 	*(struct exec_args_kva **)cookie = argkva;
 	return (argkva->addr);
 }
 
 static void
 exec_release_args_kva(struct exec_args_kva *argkva, u_int gen)
 {
 	vm_offset_t base;
 
 	base = argkva->addr;
 	if (argkva->gen != gen) {
 		(void)vm_map_madvise(exec_map, base, base + exec_map_entry_size,
 		    MADV_FREE);
 		argkva->gen = gen;
 	}
 	if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva),
 	    (uintptr_t)NULL, (uintptr_t)argkva)) {
 		mtx_lock(&exec_args_kva_mtx);
 		SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next);
 		wakeup_one(&exec_args_kva_freelist);
 		mtx_unlock(&exec_args_kva_mtx);
 	}
 }
 
 static void
 exec_free_args_kva(void *cookie)
 {
 
 	exec_release_args_kva(cookie, exec_args_gen);
 }
 
 static void
 exec_args_kva_lowmem(void *arg __unused)
 {
 	SLIST_HEAD(, exec_args_kva) head;
 	struct exec_args_kva *argkva;
 	u_int gen;
 	int i;
 
 	gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1;
 
 	/*
 	 * Force an madvise of each KVA range. Any currently allocated ranges
 	 * will have MADV_FREE applied once they are freed.
 	 */
 	SLIST_INIT(&head);
 	mtx_lock(&exec_args_kva_mtx);
 	SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva);
 	mtx_unlock(&exec_args_kva_mtx);
 	while ((argkva = SLIST_FIRST(&head)) != NULL) {
 		SLIST_REMOVE_HEAD(&head, next);
 		exec_release_args_kva(argkva, gen);
 	}
 
 	CPU_FOREACH(i) {
 		argkva = (void *)atomic_readandclear_ptr(
 		    (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva));
 		if (argkva != NULL)
 			exec_release_args_kva(argkva, gen);
 	}
 }
 EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL,
     EVENTHANDLER_PRI_ANY);
 
 /*
  * Allocate temporary demand-paged, zero-filled memory for the file name,
  * argument, and environment strings.
  */
 int
 exec_alloc_args(struct image_args *args)
 {
 
 	args->buf = (char *)exec_alloc_args_kva(&args->bufkva);
 	return (0);
 }
 
 void
 exec_free_args(struct image_args *args)
 {
 
 	if (args->buf != NULL) {
 		exec_free_args_kva(args->bufkva);
 		args->buf = NULL;
 	}
 	if (args->fname_buf != NULL) {
 		free(args->fname_buf, M_TEMP);
 		args->fname_buf = NULL;
 	}
 	if (args->fdp != NULL)
 		fdescfree_remapped(args->fdp);
 }
 
 /*
  * A set to functions to fill struct image args.
  *
  * NOTE: exec_args_add_fname() must be called (possibly with a NULL
  * fname) before the other functions.  All exec_args_add_arg() calls must
  * be made before any exec_args_add_env() calls.  exec_args_adjust_args()
  * may be called any time after exec_args_add_fname().
  *
  * exec_args_add_fname() - install path to be executed
  * exec_args_add_arg() - append an argument string
  * exec_args_add_env() - append an env string
  * exec_args_adjust_args() - adjust location of the argument list to
  *                           allow new arguments to be prepended
  */
 int
 exec_args_add_fname(struct image_args *args, const char *fname,
     enum uio_seg segflg)
 {
 	int error;
 	size_t length;
 
 	KASSERT(args->fname == NULL, ("fname already appended"));
 	KASSERT(args->endp == NULL, ("already appending to args"));
 
 	if (fname != NULL) {
 		args->fname = args->buf;
 		error = segflg == UIO_SYSSPACE ?
 		    copystr(fname, args->fname, PATH_MAX, &length) :
 		    copyinstr(fname, args->fname, PATH_MAX, &length);
 		if (error != 0)
 			return (error == ENAMETOOLONG ? E2BIG : error);
 	} else
 		length = 0;
 
 	/* Set up for _arg_*()/_env_*() */
 	args->endp = args->buf + length;
 	/* begin_argv must be set and kept updated */
 	args->begin_argv = args->endp;
 	KASSERT(exec_map_entry_size - length >= ARG_MAX,
 	    ("too little space remaining for arguments %zu < %zu",
 	    exec_map_entry_size - length, (size_t)ARG_MAX));
 	args->stringspace = ARG_MAX;
 
 	return (0);
 }
 
 static int
 exec_args_add_str(struct image_args *args, const char *str,
     enum uio_seg segflg, int *countp)
 {
 	int error;
 	size_t length;
 
 	KASSERT(args->endp != NULL, ("endp not initialized"));
 	KASSERT(args->begin_argv != NULL, ("begin_argp not initialized"));
 
 	error = (segflg == UIO_SYSSPACE) ?
 	    copystr(str, args->endp, args->stringspace, &length) :
 	    copyinstr(str, args->endp, args->stringspace, &length);
 	if (error != 0)
 		return (error == ENAMETOOLONG ? E2BIG : error);
 	args->stringspace -= length;
 	args->endp += length;
 	(*countp)++;
 
 	return (0);
 }
 
 int
 exec_args_add_arg(struct image_args *args, const char *argp,
     enum uio_seg segflg)
 {
 
 	KASSERT(args->envc == 0, ("appending args after env"));
 
 	return (exec_args_add_str(args, argp, segflg, &args->argc));
 }
 
 int
 exec_args_add_env(struct image_args *args, const char *envp,
     enum uio_seg segflg)
 {
 
 	if (args->envc == 0)
 		args->begin_envv = args->endp;
 
 	return (exec_args_add_str(args, envp, segflg, &args->envc));
 }
 
 int
 exec_args_adjust_args(struct image_args *args, size_t consume, ssize_t extend)
 {
 	ssize_t offset;
 
 	KASSERT(args->endp != NULL, ("endp not initialized"));
 	KASSERT(args->begin_argv != NULL, ("begin_argp not initialized"));
 
 	offset = extend - consume;
 	if (args->stringspace < offset)
 		return (E2BIG);
 	memmove(args->begin_argv + extend, args->begin_argv + consume,
 	    args->endp - args->begin_argv + consume);
 	if (args->envc > 0)
 		args->begin_envv += offset;
 	args->endp += offset;
 	args->stringspace -= offset;
 	return (0);
 }
 
 char *
 exec_args_get_begin_envv(struct image_args *args)
 {
 
 	KASSERT(args->endp != NULL, ("endp not initialized"));
 
 	if (args->envc > 0)
 		return (args->begin_envv);
 	return (args->endp);
 }
 
 /*
  * Copy strings out to the new process address space, constructing new arg
  * and env vector tables. Return a pointer to the base so that it can be used
  * as the initial stack pointer.
  */
 int
 exec_copyout_strings(struct image_params *imgp, uintptr_t *stack_base)
 {
 	int argc, envc;
 	char **vectp;
 	char *stringp;
 	uintptr_t destp, ustringp;
 	struct ps_strings *arginfo;
 	struct proc *p;
 	size_t execpath_len;
 	int error, szsigcode, szps;
 	char canary[sizeof(long) * 8];
 
 	szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL)
 		execpath_len = strlen(imgp->execpath) + 1;
 	else
 		execpath_len = 0;
 	p = imgp->proc;
 	szsigcode = 0;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 	imgp->ps_strings = arginfo;
 	if (p->p_sysent->sv_sigcode_base == 0) {
 		if (p->p_sysent->sv_szsigcode != NULL)
 			szsigcode = *(p->p_sysent->sv_szsigcode);
 	}
 	destp =	(uintptr_t)arginfo;
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode != 0) {
 		destp -= szsigcode;
 		destp = rounddown2(destp, sizeof(void *));
 		error = copyout(p->p_sysent->sv_sigcode, (void *)destp,
 		    szsigcode);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (execpath_len != 0) {
 		destp -= execpath_len;
 		destp = rounddown2(destp, sizeof(void *));
 		imgp->execpathp = (void *)destp;
 		error = copyout(imgp->execpath, imgp->execpathp, execpath_len);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	destp -= sizeof(canary);
 	imgp->canary = (void *)destp;
 	error = copyout(canary, imgp->canary, sizeof(canary));
 	if (error != 0)
 		return (error);
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
 	destp -= szps;
 	destp = rounddown2(destp, sizeof(void *));
 	imgp->pagesizes = (void *)destp;
 	error = copyout(pagesizes, imgp->pagesizes, szps);
 	if (error != 0)
 		return (error);
 	imgp->pagesizeslen = szps;
 
 	/*
 	 * Allocate room for the argument and environment strings.
 	 */
 	destp -= ARG_MAX - imgp->args->stringspace;
 	destp = rounddown2(destp, sizeof(void *));
 	ustringp = destp;
 
 	if (imgp->sysent->sv_stackgap != NULL)
 		imgp->sysent->sv_stackgap(imgp, &destp);
 
 	if (imgp->auxargs) {
 		/*
 		 * Allocate room on the stack for the ELF auxargs
 		 * array.  It has up to AT_COUNT entries.
 		 */
 		destp -= AT_COUNT * sizeof(Elf_Auxinfo);
 		destp = rounddown2(destp, sizeof(void *));
 	}
 
 	vectp = (char **)destp;
 
 	/*
 	 * Allocate room for the argv[] and env vectors including the
 	 * terminating NULL pointers.
 	 */
 	vectp -= imgp->args->argc + 1 + imgp->args->envc + 1;
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	*stack_base = (uintptr_t)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	error = copyout(stringp, (void *)ustringp,
 	    ARG_MAX - imgp->args->stringspace);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	imgp->argv = vectp;
 	if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 ||
 	    suword32(&arginfo->ps_nargvstr, argc) != 0)
 		return (EFAULT);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 */
 	for (; argc > 0; --argc) {
 		if (suword(vectp++, ustringp) != 0)
 			return (EFAULT);
 		while (*stringp++ != 0)
 			ustringp++;
 		ustringp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	if (suword(vectp++, 0) != 0)
 		return (EFAULT);
 
 	imgp->envv = vectp;
 	if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 ||
 	    suword32(&arginfo->ps_nenvstr, envc) != 0)
 		return (EFAULT);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		if (suword(vectp++, ustringp) != 0)
 			return (EFAULT);
 		while (*stringp++ != 0)
 			ustringp++;
 		ustringp++;
 	}
 
 	/* end of vector table is a null pointer */
 	if (suword(vectp, 0) != 0)
 		return (EFAULT);
 
 	if (imgp->auxargs) {
 		vectp++;
 		error = imgp->sysent->sv_copyout_auxargs(imgp,
 		    (uintptr_t)vectp);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Check permissions of file to execute.
  *	Called with imgp->vp locked.
  *	Return 0 for success or error code on failure.
  */
 int
 exec_check_permissions(struct image_params *imgp)
 {
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 
 	/* Get file attributes */
 	error = VOP_GETATTR(vp, attr, td->td_ucred);
 	if (error)
 		return (error);
 
 #ifdef MAC
 	error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
 	if (error)
 		return (error);
 #endif
 
 	/*
 	 * 1) Check if file execution is disabled for the filesystem that
 	 *    this file resides on.
 	 * 2) Ensure that at least one execute bit is on. Otherwise, a
 	 *    privileged user will always succeed, and we don't want this
 	 *    to happen unless the file really is executable.
 	 * 3) Ensure that the file is a regular file.
 	 */
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
 	    (attr->va_type != VREG))
 		return (EACCES);
 
 	/*
 	 * Zero length files can't be exec'd
 	 */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/*
 	 *  Check for execute permission to file based on current credentials.
 	 */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Check number of open-for-writes on the file and deny execution
 	 * if there are any.
 	 *
 	 * Add a text reference now so no one can write to the
 	 * executable while we're activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	error = VOP_SET_TEXT(vp);
 	if (error != 0)
 		return (error);
 	imgp->textset = true;
 
 	/*
 	 * Call filesystem specific open routine (which does nothing in the
 	 * general case).
 	 */
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	if (error == 0)
 		imgp->opened = 1;
 	return (error);
 }
 
 /*
  * Exec handler registration
  */
 int
 exec_register(const struct execsw *execsw_arg)
 {
 	const struct execsw **es, **xs, **newexecsw;
 	u_int count = 2;	/* New slot and trailing NULL */
 
 	if (execsw)
 		for (es = execsw; *es; es++)
 			count++;
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 	xs = newexecsw;
 	if (execsw)
 		for (es = execsw; *es; es++)
 			*xs++ = *es;
 	*xs++ = execsw_arg;
 	*xs = NULL;
 	if (execsw)
 		free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
 
 int
 exec_unregister(const struct execsw *execsw_arg)
 {
 	const struct execsw **es, **xs, **newexecsw;
 	int count = 1;
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	for (es = execsw; *es; es++) {
 		if (*es == execsw_arg)
 			break;
 	}
 	if (*es == NULL)
 		return (ENOENT);
 	for (es = execsw; *es; es++)
 		if (*es != execsw_arg)
 			count++;
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 	xs = newexecsw;
 	for (es = execsw; *es; es++)
 		if (*es != execsw_arg)
 			*xs++ = *es;
 	*xs = NULL;
 	if (execsw)
 		free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
Index: head/sys/kern/vfs_aio.c
===================================================================
--- head/sys/kern/vfs_aio.c	(revision 367333)
+++ head/sys/kern/vfs_aio.c	(revision 367334)
@@ -1,2986 +1,2987 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. John S. Dyson's name may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
  * bad that happens because of using this software isn't the responsibility
  * of the author.  This software is distributed AS-IS.
  */
 
 /*
  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/unistd.h>
 #include <sys/posix4.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sema.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
 #include <geom/geom.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/uma.h>
 #include <sys/aio.h>
 
 /*
  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
  * overflow. (XXX will be removed soon.)
  */
 static u_long jobrefid;
 
 /*
  * Counter for aio_fsync.
  */
 static uint64_t jobseqno;
 
 #ifndef MAX_AIO_PER_PROC
 #define MAX_AIO_PER_PROC	32
 #endif
 
 #ifndef MAX_AIO_QUEUE_PER_PROC
 #define MAX_AIO_QUEUE_PER_PROC	256
 #endif
 
 #ifndef MAX_AIO_QUEUE
 #define MAX_AIO_QUEUE		1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
 #endif
 
 #ifndef MAX_BUF_AIO
 #define MAX_BUF_AIO		16
 #endif
 
 FEATURE(aio, "Asynchronous I/O");
 SYSCTL_DECL(_p1003_1b);
 
 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
 static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list");
 
 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Async IO management");
 
 static int enable_aio_unsafe = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
     "Permit asynchronous IO on all file types, not just known-safe types");
 
 static unsigned int unsafe_warningcnt = 1;
 SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
     &unsafe_warningcnt, 0,
     "Warnings that will be triggered upon failed IO requests on unsafe files");
 
 static int max_aio_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
     "Maximum number of kernel processes to use for handling async IO ");
 
 static int num_aio_procs = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
     "Number of presently active kernel processes for async IO");
 
 /*
  * The code will adjust the actual number of AIO processes towards this
  * number when it gets a chance.
  */
 static int target_aio_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
     0,
     "Preferred number of ready kernel processes for async IO");
 
 static int max_queue_count = MAX_AIO_QUEUE;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
     "Maximum number of aio requests to queue, globally");
 
 static int num_queue_count = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
     "Number of queued aio requests");
 
 static int num_buf_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
     "Number of aio requests presently handled by the buf subsystem");
 
 static int num_unmapped_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio,
     0,
     "Number of aio requests presently handled by unmapped I/O buffers");
 
 /* Number of async I/O processes in the process of being started */
 /* XXX This should be local to aio_aqueue() */
 static int num_aio_resv_start = 0;
 
 static int aiod_lifetime;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static int max_aio_per_proc = MAX_AIO_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
     0,
     "Maximum active aio requests per process");
 
 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
     &max_aio_queue_per_proc, 0,
     "Maximum queued aio requests per process");
 
 static int max_buf_aio = MAX_BUF_AIO;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
     "Maximum buf aio requests per process");
 
 /* 
  * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
  * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
  * vfs.aio.aio_listio_max.
  */
 SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
     CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
     0, "Maximum aio requests for a single lio_listio call");
 
 #ifdef COMPAT_FREEBSD6
 typedef struct oaiocb {
 	int	aio_fildes;		/* File descriptor */
 	off_t	aio_offset;		/* File offset for I/O */
 	volatile void *aio_buf;         /* I/O buffer in process space */
 	size_t	aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent aio_sigevent;	/* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private	_aiocb_private;
 } oaiocb_t;
 #endif
 
 /*
  * Below is a key of locks used to protect each member of struct kaiocb
  * aioliojob and kaioinfo and any backends.
  *
  * * - need not protected
  * a - locked by kaioinfo lock
  * b - locked by backend lock, the backend lock can be null in some cases,
  *     for example, BIO belongs to this type, in this case, proc lock is
  *     reused.
  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
  */
 
 /*
  * If the routine that services an AIO request blocks while running in an
  * AIO kernel process it can starve other I/O requests.  BIO requests
  * queued via aio_qbio() complete asynchronously and do not use AIO kernel
  * processes at all.  Socket I/O requests use a separate pool of
  * kprocs and also force non-blocking I/O.  Other file I/O requests
  * use the generic fo_read/fo_write operations which can block.  The
  * fsync and mlock operations can also block while executing.  Ideally
  * none of these requests would block while executing.
  *
  * Note that the service routines cannot toggle O_NONBLOCK in the file
  * structure directly while handling a request due to races with
  * userland threads.
  */
 
 /* jobflags */
 #define	KAIOCB_QUEUEING		0x01
 #define	KAIOCB_CANCELLED	0x02
 #define	KAIOCB_CANCELLING	0x04
 #define	KAIOCB_CHECKSYNC	0x08
 #define	KAIOCB_CLEARED		0x10
 #define	KAIOCB_FINISHED		0x20
 
 /*
  * AIO process info
  */
 #define AIOP_FREE	0x1			/* proc on free queue */
 
 struct aioproc {
 	int	aioprocflags;			/* (c) AIO proc flags */
 	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
 	struct	proc *aioproc;			/* (*) the AIO proc */
 };
 
 /*
  * data-structure for lio signal management
  */
 struct aioliojob {
 	int	lioj_flags;			/* (a) listio flags */
 	int	lioj_count;			/* (a) listio flags */
 	int	lioj_finished_count;		/* (a) listio flags */
 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
 	struct	knlist klist;			/* (a) list of knotes */
 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
 };
 
 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
 
 /*
  * per process aio data structure
  */
 struct kaioinfo {
 	struct	mtx kaio_mtx;		/* the lock to protect this struct */
 	int	kaio_flags;		/* (a) per process kaio flags */
 	int	kaio_active_count;	/* (c) number of currently used AIOs */
 	int	kaio_count;		/* (a) size of AIO queue */
 	int	kaio_buffer_count;	/* (a) number of bio buffers */
 	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
 	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
 	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
 	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
 	TAILQ_HEAD(,kaiocb) kaio_syncready;  /* (a) second q for aio_fsync */
 	struct	task kaio_task;		/* (*) task to kick aio processes */
 	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
 };
 
 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
 
 #define KAIO_RUNDOWN	0x1	/* process is being run down */
 #define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
 
 /*
  * Operations used to interact with userland aio control blocks.
  * Different ABIs provide their own operations.
  */
 struct aiocb_ops {
 	int	(*aio_copyin)(struct aiocb *ujob, struct aiocb *kjob);
 	long	(*fetch_status)(struct aiocb *ujob);
 	long	(*fetch_error)(struct aiocb *ujob);
 	int	(*store_status)(struct aiocb *ujob, long status);
 	int	(*store_error)(struct aiocb *ujob, long error);
 	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
 	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
 };
 
 static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
 static struct sema aio_newproc_sem;
 static struct mtx aio_job_mtx;
 static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
 static struct unrhdr *aiod_unr;
 
 void		aio_init_aioinfo(struct proc *p);
 static int	aio_onceonly(void);
 static int	aio_free_entry(struct kaiocb *job);
 static void	aio_process_rw(struct kaiocb *job);
 static void	aio_process_sync(struct kaiocb *job);
 static void	aio_process_mlock(struct kaiocb *job);
 static void	aio_schedule_fsync(void *context, int pending);
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *ujob,
 		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
 static int	aio_queue_file(struct file *fp, struct kaiocb *job);
 static void	aio_biowakeup(struct bio *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p,
 		    struct image_params *imgp);
 static int	aio_qbio(struct proc *p, struct kaiocb *job);
 static void	aio_daemon(void *param);
 static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
 static bool	aio_clear_cancel_function_locked(struct kaiocb *job);
 static int	aio_kick(struct proc *userp);
 static void	aio_kick_nowait(struct proc *userp);
 static void	aio_kick_helper(void *context, int pending);
 static int	filt_aioattach(struct knote *kn);
 static void	filt_aiodetach(struct knote *kn);
 static int	filt_aio(struct knote *kn, long hint);
 static int	filt_lioattach(struct knote *kn);
 static void	filt_liodetach(struct knote *kn);
 static int	filt_lio(struct knote *kn, long hint);
 
 /*
  * Zones for:
  * 	kaio	Per process async io info
  *	aiop	async io process data
  *	aiocb	async io jobs
  *	aiolio	list io jobs
  */
 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone;
 
 /* kqueue filters for aio */
 static struct filterops aio_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_aioattach,
 	.f_detach = filt_aiodetach,
 	.f_event = filt_aio,
 };
 static struct filterops lio_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_lioattach,
 	.f_detach = filt_liodetach,
 	.f_event = filt_lio
 };
 
 static eventhandler_tag exit_tag, exec_tag;
 
 TASKQUEUE_DEFINE_THREAD(aiod_kick);
 
 /*
  * Main operations function for use as a kernel module.
  */
 static int
 aio_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		aio_onceonly();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t aio_mod = {
 	"aio",
 	&aio_modload,
 	NULL
 };
 
 DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(aio, 1);
 
 /*
  * Startup initialization
  */
 static int
 aio_onceonly(void)
 {
 
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
 	TAILQ_INIT(&aio_freeproc);
 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
 	TAILQ_INIT(&aio_jobs);
 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
 	jobrefid = 1;
 	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
 
 	return (0);
 }
 
 /*
  * Init the per-process aioinfo structure.  The aioinfo limits are set
  * per-process for user limit (resource) management.
  */
 void
 aio_init_aioinfo(struct proc *p)
 {
 	struct kaioinfo *ki;
 
 	ki = uma_zalloc(kaio_zone, M_WAITOK);
 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
 	ki->kaio_flags = 0;
 	ki->kaio_active_count = 0;
 	ki->kaio_count = 0;
 	ki->kaio_buffer_count = 0;
 	TAILQ_INIT(&ki->kaio_all);
 	TAILQ_INIT(&ki->kaio_done);
 	TAILQ_INIT(&ki->kaio_jobqueue);
 	TAILQ_INIT(&ki->kaio_liojoblist);
 	TAILQ_INIT(&ki->kaio_syncqueue);
 	TAILQ_INIT(&ki->kaio_syncready);
 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
 	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
 	PROC_LOCK(p);
 	if (p->p_aioinfo == NULL) {
 		p->p_aioinfo = ki;
 		PROC_UNLOCK(p);
 	} else {
 		PROC_UNLOCK(p);
 		mtx_destroy(&ki->kaio_mtx);
 		uma_zfree(kaio_zone, ki);
 	}
 
 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
 		aio_newproc(NULL);
 }
 
 static int
 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	struct thread *td;
 	int error;
 
 	error = sigev_findtd(p, sigev, &td);
 	if (error)
 		return (error);
 	if (!KSI_ONQ(ksi)) {
 		ksiginfo_set_sigev(ksi, sigev);
 		ksi->ksi_code = SI_ASYNCIO;
 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
 		tdsendsignal(p, td, ksi->ksi_signo, ksi);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Free a job entry.  Wait for completion if it is currently active, but don't
  * delay forever.  If we delay, we return a flag that says that we have to
  * restart the queue scan.
  */
 static int
 aio_free_entry(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct proc *p;
 
 	p = job->userproc;
 	MPASS(curproc == p);
 	ki = p->p_aioinfo;
 	MPASS(ki != NULL);
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	atomic_subtract_int(&num_queue_count, 1);
 
 	ki->kaio_count--;
 	MPASS(ki->kaio_count >= 0);
 
 	TAILQ_REMOVE(&ki->kaio_done, job, plist);
 	TAILQ_REMOVE(&ki->kaio_all, job, allist);
 
 	lj = job->lio;
 	if (lj) {
 		lj->lioj_count--;
 		lj->lioj_finished_count--;
 
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			/* lio is going away, we need to destroy any knotes */
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		}
 	}
 
 	/* job is going away, we need to destroy any knotes */
 	knlist_delete(&job->klist, curthread, 1);
 	PROC_LOCK(p);
 	sigqueue_take(&job->ksi);
 	PROC_UNLOCK(p);
 
 	AIO_UNLOCK(ki);
 
 	/*
 	 * The thread argument here is used to find the owning process
 	 * and is also passed to fo_close() which may pass it to various
 	 * places such as devsw close() routines.  Because of that, we
 	 * need a thread pointer from the process owning the job that is
 	 * persistent and won't disappear out from under us or move to
 	 * another process.
 	 *
 	 * Currently, all the callers of this function call it to remove
 	 * a kaiocb from the current process' job list either via a
 	 * syscall or due to the current process calling exit() or
 	 * execve().  Thus, we know that p == curproc.  We also know that
 	 * curthread can't exit since we are curthread.
 	 *
 	 * Therefore, we use curthread as the thread to pass to
 	 * knlist_delete().  This does mean that it is possible for the
 	 * thread pointer at close time to differ from the thread pointer
 	 * at open time, but this is already true of file descriptors in
 	 * a multithreaded process.
 	 */
 	if (job->fd_file)
 		fdrop(job->fd_file, curthread);
 	crfree(job->cred);
 	uma_zfree(aiocb_zone, job);
 	AIO_LOCK(ki);
 
 	return (0);
 }
 
 static void
 aio_proc_rundown_exec(void *arg, struct proc *p,
     struct image_params *imgp __unused)
 {
    	aio_proc_rundown(arg, p);
 }
 
 static int
 aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
 {
 	aio_cancel_fn_t *func;
 	int cancelled;
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
 		return (0);
 	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
 	job->jobflags |= KAIOCB_CANCELLED;
 
 	func = job->cancel_fn;
 
 	/*
 	 * If there is no cancel routine, just leave the job marked as
 	 * cancelled.  The job should be in active use by a caller who
 	 * should complete it normally or when it fails to install a
 	 * cancel routine.
 	 */
 	if (func == NULL)
 		return (0);
 
 	/*
 	 * Set the CANCELLING flag so that aio_complete() will defer
 	 * completions of this job.  This prevents the job from being
 	 * freed out from under the cancel callback.  After the
 	 * callback any deferred completion (whether from the callback
 	 * or any other source) will be completed.
 	 */
 	job->jobflags |= KAIOCB_CANCELLING;
 	AIO_UNLOCK(ki);
 	func(job);
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_CANCELLING;
 	if (job->jobflags & KAIOCB_FINISHED) {
 		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(p, job);
 	} else {
 		/*
 		 * The cancel callback might have scheduled an
 		 * operation to cancel this request, but it is
 		 * only counted as cancelled if the request is
 		 * cancelled when the callback returns.
 		 */
 		cancelled = 0;
 	}
 	return (cancelled);
 }
 
 /*
  * Rundown the jobs for a given process.
  */
 static void
 aio_proc_rundown(void *arg, struct proc *p)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kaiocb *job, *jobn;
 
 	KASSERT(curthread->td_proc == p,
 	    ("%s: called on non-curproc", __func__));
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return;
 
 	AIO_LOCK(ki);
 	ki->kaio_flags |= KAIO_RUNDOWN;
 
 restart:
 
 	/*
 	 * Try to cancel all pending requests. This code simulates
 	 * aio_cancel on all pending I/O requests.
 	 */
 	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
 		aio_cancel_job(p, ki, job);
 	}
 
 	/* Wait for all running I/O to be finished */
 	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
 		ki->kaio_flags |= KAIO_WAKEUP;
 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
 		goto restart;
 	}
 
 	/* Free all completed I/O requests. */
 	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
 		aio_free_entry(job);
 
 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		} else {
 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
 			    lj->lioj_count, lj->lioj_finished_count);
 		}
 	}
 	AIO_UNLOCK(ki);
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
 	mtx_destroy(&ki->kaio_mtx);
 	uma_zfree(kaio_zone, ki);
 	p->p_aioinfo = NULL;
 }
 
 /*
  * Select a job to run (called by an AIO daemon).
  */
 static struct kaiocb *
 aio_selectjob(struct aioproc *aiop)
 {
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 restart:
 	TAILQ_FOREACH(job, &aio_jobs, list) {
 		userp = job->userproc;
 		ki = userp->p_aioinfo;
 
 		if (ki->kaio_active_count < max_aio_per_proc) {
 			TAILQ_REMOVE(&aio_jobs, job, list);
 			if (!aio_clear_cancel_function(job))
 				goto restart;
 
 			/* Account for currently active jobs. */
 			ki->kaio_active_count++;
 			break;
 		}
 	}
 	return (job);
 }
 
 /*
  * Move all data to a permanent storage device.  This code
  * simulates the fsync syscall.
  */
 static int
 aio_fsync_vnode(struct thread *td, struct vnode *vp)
 {
 	struct mount *mp;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 drop:
 	return (error);
 }
 
 /*
  * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
  * does the I/O request for the non-bio version of the operations.  The normal
  * vn operations are used, and this code should work in all instances for every
  * type of file, including pipes, sockets, fifos, and regular files.
  *
  * XXX I don't think it works well for socket, pipe, and fifo.
  */
 static void
 aio_process_rw(struct kaiocb *job)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct aiocb *cb;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	ssize_t cnt;
 	long msgsnd_st, msgsnd_end;
 	long msgrcv_st, msgrcv_end;
 	long oublock_st, oublock_end;
 	long inblock_st, inblock_end;
 	int error;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
 	    job->uaiocb.aio_lio_opcode == LIO_WRITE,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 
 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
 	aiov.iov_len = cb->aio_nbytes;
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = cb->aio_offset;
 	auio.uio_resid = cb->aio_nbytes;
 	cnt = cb->aio_nbytes;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 
 	msgrcv_st = td->td_ru.ru_msgrcv;
 	msgsnd_st = td->td_ru.ru_msgsnd;
 	inblock_st = td->td_ru.ru_inblock;
 	oublock_st = td->td_ru.ru_oublock;
 
 	/*
 	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
 	if (cb->aio_lio_opcode == LIO_READ) {
 		auio.uio_rw = UIO_READ;
 		if (auio.uio_resid == 0)
 			error = 0;
 		else
 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	} else {
 		if (fp->f_type == DTYPE_VNODE)
 			bwillwrite();
 		auio.uio_rw = UIO_WRITE;
 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	}
 	msgrcv_end = td->td_ru.ru_msgrcv;
 	msgsnd_end = td->td_ru.ru_msgsnd;
 	inblock_end = td->td_ru.ru_inblock;
 	oublock_end = td->td_ru.ru_oublock;
 
 	job->msgrcv = msgrcv_end - msgrcv_st;
 	job->msgsnd = msgsnd_end - msgsnd_st;
 	job->inblock = inblock_end - inblock_st;
 	job->outblock = oublock_end - oublock_st;
 
 	if ((error) && (auio.uio_resid != cnt)) {
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 	}
 
 	cnt -= auio.uio_resid;
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, cnt, 0);
 }
 
 static void
 aio_process_sync(struct kaiocb *job)
 {
 	struct thread *td = curthread;
 	struct ucred *td_savedcred = td->td_ucred;
 	struct file *fp = job->fd_file;
 	int error = 0;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	td->td_ucred = job->cred;
 	if (fp->f_vnode != NULL)
 		error = aio_fsync_vnode(td, fp->f_vnode);
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, 0, 0);
 }
 
 static void
 aio_process_mlock(struct kaiocb *job)
 {
 	struct aiocb *cb = &job->uaiocb;
 	int error;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
 	error = kern_mlock(job->userproc, job->cred,
 	    __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
 	aio_complete(job, error != 0 ? -1 : 0, error);
 }
 
 static void
 aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct kaiocb *sjob, *sjobn;
 	int lj_done;
 	bool schedule_fsync;
 
 	ki = userp->p_aioinfo;
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	lj = job->lio;
 	lj_done = 0;
 	if (lj) {
 		lj->lioj_finished_count++;
 		if (lj->lioj_count == lj->lioj_finished_count)
 			lj_done = 1;
 	}
 	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	if (ki->kaio_flags & KAIO_RUNDOWN)
 		goto notification_done;
 
 	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
 		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
 
 	KNOTE_LOCKED(&job->klist, 1);
 
 	if (lj_done) {
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 			KNOTE_LOCKED(&lj->klist, 1);
 		}
 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 		    == LIOJ_SIGNAL
 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 		}
 	}
 
 notification_done:
 	if (job->jobflags & KAIOCB_CHECKSYNC) {
 		schedule_fsync = false;
 		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
 			if (job->fd_file != sjob->fd_file ||
 			    job->seqno >= sjob->seqno)
 				continue;
 			if (--sjob->pending > 0)
 				continue;
 			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
 			if (!aio_clear_cancel_function_locked(sjob))
 				continue;
 			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
 			schedule_fsync = true;
 		}
 		if (schedule_fsync)
 			taskqueue_enqueue(taskqueue_aiod_kick,
 			    &ki->kaio_sync_task);
 	}
 	if (ki->kaio_flags & KAIO_WAKEUP) {
 		ki->kaio_flags &= ~KAIO_WAKEUP;
 		wakeup(&userp->p_aioinfo);
 	}
 }
 
 static void
 aio_schedule_fsync(void *context, int pending)
 {
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 
 	ki = context;
 	AIO_LOCK(ki);
 	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
 		job = TAILQ_FIRST(&ki->kaio_syncready);
 		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
 		AIO_UNLOCK(ki);
 		aio_schedule(job, aio_process_sync);
 		AIO_LOCK(ki);
 	}
 	AIO_UNLOCK(ki);
 }
 
 bool
 aio_cancel_cleared(struct kaiocb *job)
 {
 
 	/*
 	 * The caller should hold the same queue lock held when
 	 * aio_clear_cancel_function() was called and set this flag
 	 * ensuring this check sees an up-to-date value.  However,
 	 * there is no way to assert that.
 	 */
 	return ((job->jobflags & KAIOCB_CLEARED) != 0);
 }
 
 static bool
 aio_clear_cancel_function_locked(struct kaiocb *job)
 {
 
 	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
 	MPASS(job->cancel_fn != NULL);
 	if (job->jobflags & KAIOCB_CANCELLING) {
 		job->jobflags |= KAIOCB_CLEARED;
 		return (false);
 	}
 	job->cancel_fn = NULL;
 	return (true);
 }
 
 bool
 aio_clear_cancel_function(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	bool ret;
 
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
 	ret = aio_clear_cancel_function_locked(job);
 	AIO_UNLOCK(ki);
 	return (ret);
 }
 
 static bool
 aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
 {
 
 	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
 	if (job->jobflags & KAIOCB_CANCELLED)
 		return (false);
 	job->cancel_fn = func;
 	return (true);
 }
 
 bool
 aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
 {
 	struct kaioinfo *ki;
 	bool ret;
 
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
 	ret = aio_set_cancel_function_locked(job, func);
 	AIO_UNLOCK(ki);
 	return (ret);
 }
 
 void
 aio_complete(struct kaiocb *job, long status, int error)
 {
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	job->uaiocb._aiocb_private.error = error;
 	job->uaiocb._aiocb_private.status = status;
 
 	userp = job->userproc;
 	ki = userp->p_aioinfo;
 
 	AIO_LOCK(ki);
 	KASSERT(!(job->jobflags & KAIOCB_FINISHED),
 	    ("duplicate aio_complete"));
 	job->jobflags |= KAIOCB_FINISHED;
 	if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(userp, job);
 	}
 	AIO_UNLOCK(ki);
 }
 
 void
 aio_cancel(struct kaiocb *job)
 {
 
 	aio_complete(job, -1, ECANCELED);
 }
 
 void
 aio_switch_vmspace(struct kaiocb *job)
 {
 
 	vmspace_switch_aio(job->userproc->p_vmspace);
 }
 
 /*
  * The AIO daemon, most of the actual work is done in aio_process_*,
  * but the setup (and address space mgmt) is done in this routine.
  */
 static void
 aio_daemon(void *_id)
 {
 	struct kaiocb *job;
 	struct aioproc *aiop;
 	struct kaioinfo *ki;
 	struct proc *p;
 	struct vmspace *myvm;
 	struct thread *td = curthread;
 	int id = (intptr_t)_id;
 
 	/*
 	 * Grab an extra reference on the daemon's vmspace so that it
 	 * doesn't get freed by jobs that switch to a different
 	 * vmspace.
 	 */
 	p = td->td_proc;
 	myvm = vmspace_acquire_ref(p);
 
 	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
 
 	/*
 	 * Allocate and ready the aio control info.  There is one aiop structure
 	 * per daemon.
 	 */
 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
 	aiop->aioproc = p;
 	aiop->aioprocflags = 0;
 
 	/*
 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
 	 * and creating too many daemons.)
 	 */
 	sema_post(&aio_newproc_sem);
 
 	mtx_lock(&aio_job_mtx);
 	for (;;) {
 		/*
 		 * Take daemon off of free queue
 		 */
 		if (aiop->aioprocflags & AIOP_FREE) {
 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
 			aiop->aioprocflags &= ~AIOP_FREE;
 		}
 
 		/*
 		 * Check for jobs.
 		 */
 		while ((job = aio_selectjob(aiop)) != NULL) {
 			mtx_unlock(&aio_job_mtx);
 
 			ki = job->userproc->p_aioinfo;
 			job->handle_fn(job);
 
 			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
 			ki->kaio_active_count--;
 		}
 
 		/*
 		 * Disconnect from user address space.
 		 */
 		if (p->p_vmspace != myvm) {
 			mtx_unlock(&aio_job_mtx);
 			vmspace_switch_aio(myvm);
 			mtx_lock(&aio_job_mtx);
 			/*
 			 * We have to restart to avoid race, we only sleep if
 			 * no job can be selected.
 			 */
 			continue;
 		}
 
 		mtx_assert(&aio_job_mtx, MA_OWNED);
 
 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
 		aiop->aioprocflags |= AIOP_FREE;
 
 		/*
 		 * If daemon is inactive for a long time, allow it to exit,
 		 * thereby freeing resources.
 		 */
 		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
 		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
 		    (aiop->aioprocflags & AIOP_FREE) &&
 		    num_aio_procs > target_aio_procs)
 			break;
 	}
 	TAILQ_REMOVE(&aio_freeproc, aiop, list);
 	num_aio_procs--;
 	mtx_unlock(&aio_job_mtx);
 	uma_zfree(aiop_zone, aiop);
 	free_unr(aiod_unr, id);
 	vmspace_free(myvm);
 
 	KASSERT(p->p_vmspace == myvm,
 	    ("AIOD: bad vmspace for exiting daemon"));
-	KASSERT(myvm->vm_refcnt > 1,
-	    ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
+	KASSERT(refcount_load(&myvm->vm_refcnt) > 1,
+	    ("AIOD: bad vm refcnt for exiting daemon: %d",
+	    refcount_load(&myvm->vm_refcnt)));
 	kproc_exit(0);
 }
 
 /*
  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
  * AIO daemon modifies its environment itself.
  */
 static int
 aio_newproc(int *start)
 {
 	int error;
 	struct proc *p;
 	int id;
 
 	id = alloc_unr(aiod_unr);
 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
 		RFNOWAIT, 0, "aiod%d", id);
 	if (error == 0) {
 		/*
 		 * Wait until daemon is started.
 		 */
 		sema_wait(&aio_newproc_sem);
 		mtx_lock(&aio_job_mtx);
 		num_aio_procs++;
 		if (start != NULL)
 			(*start)--;
 		mtx_unlock(&aio_job_mtx);
 	} else {
 		free_unr(aiod_unr, id);
 	}
 	return (error);
 }
 
 /*
  * Try the high-performance, low-overhead bio method for eligible
  * VCHR devices.  This method doesn't use an aio helper thread, and
  * thus has very low overhead.
  *
  * Assumes that the caller, aio_aqueue(), has incremented the file
  * structure's reference count, preventing its deallocation for the
  * duration of this call.
  */
 static int
 aio_qbio(struct proc *p, struct kaiocb *job)
 {
 	struct aiocb *cb;
 	struct file *fp;
 	struct bio *bp;
 	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
 	int error, ref, poff;
 	vm_prot_t prot;
 
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 
 	if (!(cb->aio_lio_opcode == LIO_WRITE ||
 	    cb->aio_lio_opcode == LIO_READ))
 		return (-1);
 	if (fp == NULL || fp->f_type != DTYPE_VNODE)
 		return (-1);
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VCHR)
 		return (-1);
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
 
 	if ((csw->d_flags & D_DISK) == 0) {
 		error = -1;
 		goto unref;
 	}
 	if (cb->aio_nbytes > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
 	ki = p->p_aioinfo;
 	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
 	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
 		if (cb->aio_nbytes > MAXPHYS) {
 			error = -1;
 			goto unref;
 		}
 
 		pbuf = NULL;
 	} else {
 		if (cb->aio_nbytes > MAXPHYS - poff) {
 			error = -1;
 			goto unref;
 		}
 		if (ki->kaio_buffer_count >= max_buf_aio) {
 			error = EAGAIN;
 			goto unref;
 		}
 
 		job->pbuf = pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
 		BUF_KERNPROC(pbuf);
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count++;
 		AIO_UNLOCK(ki);
 	}
 	job->bp = bp = g_alloc_bio();
 
 	bp->bio_length = cb->aio_nbytes;
 	bp->bio_bcount = cb->aio_nbytes;
 	bp->bio_done = aio_biowakeup;
 	bp->bio_offset = cb->aio_offset;
 	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
 	bp->bio_dev = dev;
 	bp->bio_caller1 = (void *)job;
 
 	prot = VM_PROT_READ;
 	if (cb->aio_lio_opcode == LIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)cb->aio_buf, bp->bio_length, prot, job->pages,
 	    nitems(job->pages));
 	if (job->npages < 0) {
 		error = EFAULT;
 		goto doerror;
 	}
 	if (pbuf != NULL) {
 		pmap_qenter((vm_offset_t)pbuf->b_data,
 		    job->pages, job->npages);
 		bp->bio_data = pbuf->b_data + poff;
 		atomic_add_int(&num_buf_aio, 1);
 	} else {
 		bp->bio_ma = job->pages;
 		bp->bio_ma_n = job->npages;
 		bp->bio_ma_offset = poff;
 		bp->bio_data = unmapped_buf;
 		bp->bio_flags |= BIO_UNMAPPED;
 		atomic_add_int(&num_unmapped_aio, 1);
 	}
 
 	/* Perform transfer. */
 	csw->d_strategy(bp);
 	dev_relthread(dev, ref);
 	return (0);
 
 doerror:
 	if (pbuf != NULL) {
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
 		uma_zfree(pbuf_zone, pbuf);
 		job->pbuf = NULL;
 	}
 	g_destroy_bio(bp);
 	job->bp = NULL;
 unref:
 	dev_relthread(dev, ref);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD6
 static int
 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
 {
 
 	/*
 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
 	 * supported by AIO with the old sigevent structure.
 	 */
 	nsig->sigev_notify = osig->sigev_notify;
 	switch (nsig->sigev_notify) {
 	case SIGEV_NONE:
 		break;
 	case SIGEV_SIGNAL:
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		break;
 	case SIGEV_KEVENT:
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct oaiocb *ojob;
 	int error;
 
 	bzero(kjob, sizeof(struct aiocb));
 	error = copyin(ujob, kjob, sizeof(struct oaiocb));
 	if (error)
 		return (error);
 	ojob = (struct oaiocb *)kjob;
 	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
 }
 #endif
 
 static int
 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
 {
 
 	return (copyin(ujob, kjob, sizeof(struct aiocb)));
 }
 
 static long
 aiocb_fetch_status(struct aiocb *ujob)
 {
 
 	return (fuword(&ujob->_aiocb_private.status));
 }
 
 static long
 aiocb_fetch_error(struct aiocb *ujob)
 {
 
 	return (fuword(&ujob->_aiocb_private.error));
 }
 
 static int
 aiocb_store_status(struct aiocb *ujob, long status)
 {
 
 	return (suword(&ujob->_aiocb_private.status, status));
 }
 
 static int
 aiocb_store_error(struct aiocb *ujob, long error)
 {
 
 	return (suword(&ujob->_aiocb_private.error, error));
 }
 
 static int
 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 
 	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
 }
 
 static int
 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 
 	return (suword(ujobp, (long)ujob));
 }
 
 static struct aiocb_ops aiocb_ops = {
 	.aio_copyin = aiocb_copyin,
 	.fetch_status = aiocb_fetch_status,
 	.fetch_error = aiocb_fetch_error,
 	.store_status = aiocb_store_status,
 	.store_error = aiocb_store_error,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 static struct aiocb_ops aiocb_ops_osigevent = {
 	.aio_copyin = aiocb_copyin_old_sigevent,
 	.fetch_status = aiocb_fetch_status,
 	.fetch_error = aiocb_fetch_error,
 	.store_status = aiocb_store_status,
 	.store_error = aiocb_store_error,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 #endif
 
 /*
  * Queue a new AIO request.  Choosing either the threaded or direct bio VCHR
  * technique is done in this code.
  */
 int
 aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
     int type, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct kevent kev;
 	int opcode;
 	int error;
 	int fd, kqfd;
 	int jid;
 	u_short evflags;
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	ops->store_status(ujob, -1);
 	ops->store_error(ujob, 0);
 	ops->store_kernelinfo(ujob, -1);
 
 	if (num_queue_count >= max_queue_count ||
 	    ki->kaio_count >= max_aio_queue_per_proc) {
 		ops->store_error(ujob, EAGAIN);
 		return (EAGAIN);
 	}
 
 	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
 	knlist_init_mtx(&job->klist, AIO_MTX(ki));
 
 	error = ops->aio_copyin(ujob, &job->uaiocb);
 	if (error) {
 		ops->store_error(ujob, error);
 		uma_zfree(aiocb_zone, job);
 		return (error);
 	}
 
 	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
 		ops->store_error(ujob, EINVAL);
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
 		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	ksiginfo_init(&job->ksi);
 
 	/* Save userspace address of the job info. */
 	job->ujob = ujob;
 
 	/* Get the opcode. */
 	if (type != LIO_NOP)
 		job->uaiocb.aio_lio_opcode = type;
 	opcode = job->uaiocb.aio_lio_opcode;
 
 	/*
 	 * Validate the opcode and fetch the file object for the specified
 	 * file descriptor.
 	 *
 	 * XXXRW: Moved the opcode validation up here so that we don't
 	 * retrieve a file descriptor without knowing what the capabiltity
 	 * should be.
 	 */
 	fd = job->uaiocb.aio_fildes;
 	switch (opcode) {
 	case LIO_WRITE:
 		error = fget_write(td, fd, &cap_pwrite_rights, &fp);
 		break;
 	case LIO_READ:
 		error = fget_read(td, fd, &cap_pread_rights, &fp);
 		break;
 	case LIO_SYNC:
 		error = fget(td, fd, &cap_fsync_rights, &fp);
 		break;
 	case LIO_MLOCK:
 		fp = NULL;
 		break;
 	case LIO_NOP:
 		error = fget(td, fd, &cap_no_rights, &fp);
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error) {
 		uma_zfree(aiocb_zone, job);
 		ops->store_error(ujob, error);
 		return (error);
 	}
 
 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	if ((opcode == LIO_READ || opcode == LIO_WRITE) &&
 	    job->uaiocb.aio_offset < 0 &&
 	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	job->fd_file = fp;
 
 	mtx_lock(&aio_job_mtx);
 	jid = jobrefid++;
 	job->seqno = jobseqno++;
 	mtx_unlock(&aio_job_mtx);
 	error = ops->store_kernelinfo(ujob, jid);
 	if (error) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
 
 	if (opcode == LIO_NOP) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, job);
 		return (0);
 	}
 
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
 		goto no_kqueue;
 	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
 	memset(&kev, 0, sizeof(kev));
 	kev.ident = (uintptr_t)job->ujob;
 	kev.filter = EVFILT_AIO;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
 	kev.data = (intptr_t)job;
 	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
 	error = kqfd_register(kqfd, &kev, td, M_WAITOK);
 	if (error)
 		goto aqueue_fail;
 
 no_kqueue:
 
 	ops->store_error(ujob, EINPROGRESS);
 	job->uaiocb._aiocb_private.error = EINPROGRESS;
 	job->userproc = p;
 	job->cred = crhold(td->td_ucred);
 	job->jobflags = KAIOCB_QUEUEING;
 	job->lio = lj;
 
 	if (opcode == LIO_MLOCK) {
 		aio_schedule(job, aio_process_mlock);
 		error = 0;
 	} else if (fp->f_ops->fo_aio_queue == NULL)
 		error = aio_queue_file(fp, job);
 	else
 		error = fo_aio_queue(fp, job);
 	if (error)
 		goto aqueue_fail;
 
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_QUEUEING;
 	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
 	ki->kaio_count++;
 	if (lj)
 		lj->lioj_count++;
 	atomic_add_int(&num_queue_count, 1);
 	if (job->jobflags & KAIOCB_FINISHED) {
 		/*
 		 * The queue callback completed the request synchronously.
 		 * The bulk of the completion is deferred in that case
 		 * until this point.
 		 */
 		aio_bio_done_notify(p, job);
 	} else
 		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
 	AIO_UNLOCK(ki);
 	return (0);
 
 aqueue_fail:
 	knlist_delete(&job->klist, curthread, 0);
 	if (fp)
 		fdrop(fp, td);
 	uma_zfree(aiocb_zone, job);
 	ops->store_error(ujob, error);
 	return (error);
 }
 
 static void
 aio_cancel_daemon_job(struct kaiocb *job)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&aio_jobs, job, list);
 	mtx_unlock(&aio_job_mtx);
 	aio_cancel(job);
 }
 
 void
 aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
 		mtx_unlock(&aio_job_mtx);
 		aio_cancel(job);
 		return;
 	}
 	job->handle_fn = func;
 	TAILQ_INSERT_TAIL(&aio_jobs, job, list);
 	aio_kick_nowait(job->userproc);
 	mtx_unlock(&aio_job_mtx);
 }
 
 static void
 aio_cancel_sync(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
 	AIO_UNLOCK(ki);
 	aio_cancel(job);
 }
 
 int
 aio_queue_file(struct file *fp, struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	struct kaiocb *job2;
 	struct vnode *vp;
 	struct mount *mp;
 	int error;
 	bool safe;
 
 	ki = job->userproc->p_aioinfo;
 	error = aio_qbio(job->userproc, job);
 	if (error >= 0)
 		return (error);
 	safe = false;
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vp->v_type == VREG || vp->v_type == VDIR) {
 			mp = fp->f_vnode->v_mount;
 			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
 				safe = true;
 		}
 	}
 	if (!(safe || enable_aio_unsafe)) {
 		counted_warning(&unsafe_warningcnt,
 		    "is attempting to use unsafe AIO requests");
 		return (EOPNOTSUPP);
 	}
 
 	switch (job->uaiocb.aio_lio_opcode) {
 	case LIO_READ:
 	case LIO_WRITE:
 		aio_schedule(job, aio_process_rw);
 		error = 0;
 		break;
 	case LIO_SYNC:
 		AIO_LOCK(ki);
 		TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
 			if (job2->fd_file == job->fd_file &&
 			    job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
 			    job2->seqno < job->seqno) {
 				job2->jobflags |= KAIOCB_CHECKSYNC;
 				job->pending++;
 			}
 		}
 		if (job->pending != 0) {
 			if (!aio_set_cancel_function_locked(job,
 				aio_cancel_sync)) {
 				AIO_UNLOCK(ki);
 				aio_cancel(job);
 				return (0);
 			}
 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
 			AIO_UNLOCK(ki);
 			return (0);
 		}
 		AIO_UNLOCK(ki);
 		aio_schedule(job, aio_process_sync);
 		error = 0;
 		break;
 	default:
 		error = EINVAL;
 	}
 	return (error);
 }
 
 static void
 aio_kick_nowait(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aioproc *aiop;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aioprocflags &= ~AIOP_FREE;
 		wakeup(aiop->aioproc);
 	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
 	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
 		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
 	}
 }
 
 static int
 aio_kick(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aioproc *aiop;
 	int error, ret = 0;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 retryproc:
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aioprocflags &= ~AIOP_FREE;
 		wakeup(aiop->aioproc);
 	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
 	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
 		num_aio_resv_start++;
 		mtx_unlock(&aio_job_mtx);
 		error = aio_newproc(&num_aio_resv_start);
 		mtx_lock(&aio_job_mtx);
 		if (error) {
 			num_aio_resv_start--;
 			goto retryproc;
 		}
 	} else {
 		ret = -1;
 	}
 	return (ret);
 }
 
 static void
 aio_kick_helper(void *context, int pending)
 {
 	struct proc *userp = context;
 
 	mtx_lock(&aio_job_mtx);
 	while (--pending >= 0) {
 		if (aio_kick(userp))
 			break;
 	}
 	mtx_unlock(&aio_job_mtx);
 }
 
 /*
  * Support the aio_return system call, as a side-effect, kernel resources are
  * released.
  */
 static int
 kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	long status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EINVAL);
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
 		if (job->ujob == ujob)
 			break;
 	}
 	if (job != NULL) {
 		MPASS(job->jobflags & KAIOCB_FINISHED);
 		status = job->uaiocb._aiocb_private.status;
 		error = job->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		td->td_ru.ru_oublock += job->outblock;
 		td->td_ru.ru_inblock += job->inblock;
 		td->td_ru.ru_msgsnd += job->msgsnd;
 		td->td_ru.ru_msgrcv += job->msgrcv;
 		aio_free_entry(job);
 		AIO_UNLOCK(ki);
 		ops->store_error(ujob, error);
 		ops->store_status(ujob, status);
 	} else {
 		error = EINVAL;
 		AIO_UNLOCK(ki);
 	}
 	return (error);
 }
 
 int
 sys_aio_return(struct thread *td, struct aio_return_args *uap)
 {
 
 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
 }
 
 /*
  * Allow a process to wakeup when any of the I/O requests are completed.
  */
 static int
 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
     struct timespec *ts)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *firstjob, *job;
 	int error, i, timo;
 
 	timo = 0;
 	if (ts) {
 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EAGAIN);
 
 	if (njoblist == 0)
 		return (0);
 
 	AIO_LOCK(ki);
 	for (;;) {
 		firstjob = NULL;
 		error = 0;
 		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 			for (i = 0; i < njoblist; i++) {
 				if (job->ujob == ujoblist[i]) {
 					if (firstjob == NULL)
 						firstjob = job;
 					if (job->jobflags & KAIOCB_FINISHED)
 						goto RETURN;
 				}
 			}
 		}
 		/* All tasks were finished. */
 		if (firstjob == NULL)
 			break;
 
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiospn", timo);
 		if (error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 RETURN:
 	AIO_UNLOCK(ki);
 	return (error);
 }
 
 int
 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
 {
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	int error;
 
 	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
 	if (error == 0)
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	free(ujoblist, M_AIOS);
 	return (error);
 }
 
 /*
  * aio_cancel cancels any non-bio aio operations not currently in progress.
  */
 int
 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct kaioinfo *ki;
 	struct kaiocb *job, *jobn;
 	struct file *fp;
 	int error;
 	int cancelled = 0;
 	int notcancelled = 0;
 	struct vnode *vp;
 
 	/* Lookup file object. */
 	error = fget(td, uap->fd, &cap_no_rights, &fp);
 	if (error)
 		return (error);
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		goto done;
 
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vn_isdisk(vp)) {
 			fdrop(fp, td);
 			td->td_retval[0] = AIO_NOTCANCELED;
 			return (0);
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
 		if ((uap->fd == job->uaiocb.aio_fildes) &&
 		    ((uap->aiocbp == NULL) ||
 		     (uap->aiocbp == job->ujob))) {
 			if (aio_cancel_job(p, ki, job)) {
 				cancelled++;
 			} else {
 				notcancelled++;
 			}
 			if (uap->aiocbp != NULL)
 				break;
 		}
 	}
 	AIO_UNLOCK(ki);
 
 done:
 	fdrop(fp, td);
 
 	if (uap->aiocbp != NULL) {
 		if (cancelled) {
 			td->td_retval[0] = AIO_CANCELED;
 			return (0);
 		}
 	}
 
 	if (notcancelled) {
 		td->td_retval[0] = AIO_NOTCANCELED;
 		return (0);
 	}
 
 	if (cancelled) {
 		td->td_retval[0] = AIO_CANCELED;
 		return (0);
 	}
 
 	td->td_retval[0] = AIO_ALLDONE;
 
 	return (0);
 }
 
 /*
  * aio_error is implemented in the kernel level for compatibility purposes
  * only.  For a user mode async implementation, it would be best to do it in
  * a userland subroutine.
  */
 static int
 kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	int status;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL) {
 		td->td_retval[0] = EINVAL;
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 		if (job->ujob == ujob) {
 			if (job->jobflags & KAIOCB_FINISHED)
 				td->td_retval[0] =
 					job->uaiocb._aiocb_private.error;
 			else
 				td->td_retval[0] = EINPROGRESS;
 			AIO_UNLOCK(ki);
 			return (0);
 		}
 	}
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Hack for failure of aio_aqueue.
 	 */
 	status = ops->fetch_status(ujob);
 	if (status == -1) {
 		td->td_retval[0] = ops->fetch_error(ujob);
 		return (0);
 	}
 
 	td->td_retval[0] = EINVAL;
 	return (0);
 }
 
 int
 sys_aio_error(struct thread *td, struct aio_error_args *uap)
 {
 
 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
 }
 
 /* syscall - asynchronous read from a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
 	    &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_read(struct thread *td, struct aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
 }
 
 /* syscall - asynchronous write to a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
 	    &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_write(struct thread *td, struct aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
 }
 
 int
 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
 }
 
 static int
 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
     struct aiocb **acb_list, int nent, struct sigevent *sig,
     struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct aiocb *job;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kevent kev;
 	int error;
 	int nagain, nerror;
 	int i;
 
 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
 		return (EINVAL);
 
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
 	lj->lioj_flags = 0;
 	lj->lioj_count = 0;
 	lj->lioj_finished_count = 0;
 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
 	ksiginfo_init(&lj->lioj_ksi);
 
 	/*
 	 * Setup signal.
 	 */
 	if (sig && (mode == LIO_NOWAIT)) {
 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			/* Assume only new style KEVENT */
 			memset(&kev, 0, sizeof(kev));
 			kev.filter = EVFILT_LIO;
 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 			kev.ident = (uintptr_t)uacb_list; /* something unique */
 			kev.data = (intptr_t)lj;
 			/* pass user defined sigval data */
 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
 			error = kqfd_register(
 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td,
 			    M_WAITOK);
 			if (error) {
 				uma_zfree(aiolio_zone, lj);
 				return (error);
 			}
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
 			;
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
 					uma_zfree(aiolio_zone, lj);
 					return EINVAL;
 				}
 				lj->lioj_flags |= LIOJ_SIGNAL;
 		} else {
 			uma_zfree(aiolio_zone, lj);
 			return EINVAL;
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 	/*
 	 * Add extra aiocb count to avoid the lio to be freed
 	 * by other threads doing aio_waitcomplete or aio_return,
 	 * and prevent event from being sent until we have queued
 	 * all tasks.
 	 */
 	lj->lioj_count = 1;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get pointers to the list of I/O requests.
 	 */
 	nagain = 0;
 	nerror = 0;
 	for (i = 0; i < nent; i++) {
 		job = acb_list[i];
 		if (job != NULL) {
 			error = aio_aqueue(td, job, lj, LIO_NOP, ops);
 			if (error == EAGAIN)
 				nagain++;
 			else if (error != 0)
 				nerror++;
 		}
 	}
 
 	error = 0;
 	AIO_LOCK(ki);
 	if (mode == LIO_WAIT) {
 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
 			ki->kaio_flags |= KAIO_WAKEUP;
 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
 			    PRIBIO | PCATCH, "aiospn", 0);
 			if (error == ERESTART)
 				error = EINTR;
 			if (error)
 				break;
 		}
 	} else {
 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 				KNOTE_LOCKED(&lj->klist, 1);
 			}
 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 			    == LIOJ_SIGNAL
 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 				aio_sendsig(p, &lj->lioj_signal,
 					    &lj->lioj_ksi);
 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 			}
 		}
 	}
 	lj->lioj_count--;
 	if (lj->lioj_count == 0) {
 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 		knlist_delete(&lj->klist, curthread, 1);
 		PROC_LOCK(p);
 		sigqueue_take(&lj->lioj_ksi);
 		PROC_UNLOCK(p);
 		AIO_UNLOCK(ki);
 		uma_zfree(aiolio_zone, lj);
 	} else
 		AIO_UNLOCK(ki);
 
 	if (nerror)
 		return (EIO);
 	else if (nagain)
 		return (EAGAIN);
 	else
 		return (error);
 }
 
 /* syscall - list directed I/O (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent osig;
 	int error, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0)
 		error = kern_lio_listio(td, uap->mode,
 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 		    &aiocb_ops_osigevent);
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 /* syscall - list directed I/O (REALTIME) */
 int
 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	int error, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &sig, sizeof(sig));
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0)
 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
 		    nent, sigp, &aiocb_ops);
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 static void
 aio_biowakeup(struct bio *bp)
 {
 	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
 	struct proc *userp;
 	struct kaioinfo *ki;
 	size_t nbytes;
 	int error, nblks;
 
 	/* Release mapping into kernel space. */
 	userp = job->userproc;
 	ki = userp->p_aioinfo;
 	if (job->pbuf) {
 		pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
 		uma_zfree(pbuf_zone, job->pbuf);
 		job->pbuf = NULL;
 		atomic_subtract_int(&num_buf_aio, 1);
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
 	} else
 		atomic_subtract_int(&num_unmapped_aio, 1);
 	vm_page_unhold_pages(job->pages, job->npages);
 
 	bp = job->bp;
 	job->bp = NULL;
 	nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
 	error = 0;
 	if (bp->bio_flags & BIO_ERROR)
 		error = bp->bio_error;
 	nblks = btodb(nbytes);
 	if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
 		job->outblock += nblks;
 	else
 		job->inblock += nblks;
 
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, nbytes, 0);
 
 	g_destroy_bio(bp);
 }
 
 /* syscall - wait for the next completion of an aio request */
 static int
 kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
     struct timespec *ts, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 	struct aiocb *ujob;
 	long error, status;
 	int timo;
 
 	ops->store_aiocb(ujobp, NULL);
 
 	if (ts == NULL) {
 		timo = 0;
 	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
 		timo = -1;
 	} else {
 		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 	ki = p->p_aioinfo;
 
 	error = 0;
 	job = NULL;
 	AIO_LOCK(ki);
 	while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
 		if (timo == -1) {
 			error = EWOULDBLOCK;
 			break;
 		}
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiowc", timo);
 		if (timo && error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 
 	if (job != NULL) {
 		MPASS(job->jobflags & KAIOCB_FINISHED);
 		ujob = job->ujob;
 		status = job->uaiocb._aiocb_private.status;
 		error = job->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		td->td_ru.ru_oublock += job->outblock;
 		td->td_ru.ru_inblock += job->inblock;
 		td->td_ru.ru_msgsnd += job->msgsnd;
 		td->td_ru.ru_msgrcv += job->msgrcv;
 		aio_free_entry(job);
 		AIO_UNLOCK(ki);
 		ops->store_aiocb(ujobp, ujob);
 		ops->store_error(ujob, error);
 		ops->store_status(ujob, status);
 	} else
 		AIO_UNLOCK(ki);
 
 	return (error);
 }
 
 int
 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
 {
 	struct timespec ts, *tsp;
 	int error;
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
 }
 
 static int
 kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
     struct aiocb_ops *ops)
 {
 
 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
 		return (EINVAL);
 	return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
 }
 
 int
 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
 {
 
 	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
 }
 
 /* kqueue attach function */
 static int
 filt_aioattach(struct knote *kn)
 {
 	struct kaiocb *job;
 
 	job = (struct kaiocb *)(uintptr_t)kn->kn_sdata;
 
 	/*
 	 * The job pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_ptr.p_aio = job;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&job->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_aiodetach(struct knote *kn)
 {
 	struct knlist *knl;
 
 	knl = &kn->kn_ptr.p_aio->klist;
 	knl->kl_lock(knl->kl_lockarg);
 	if (!knlist_empty(knl))
 		knlist_remove(knl, kn, 1);
 	knl->kl_unlock(knl->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_aio(struct knote *kn, long hint)
 {
 	struct kaiocb *job = kn->kn_ptr.p_aio;
 
 	kn->kn_data = job->uaiocb._aiocb_private.error;
 	if (!(job->jobflags & KAIOCB_FINISHED))
 		return (0);
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /* kqueue attach function */
 static int
 filt_lioattach(struct knote *kn)
 {
 	struct aioliojob *lj;
 
 	lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata;
 
 	/*
 	 * The aioliojob pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_ptr.p_lio = lj;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&lj->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_liodetach(struct knote *kn)
 {
 	struct knlist *knl;
 
 	knl = &kn->kn_ptr.p_lio->klist;
 	knl->kl_lock(knl->kl_lockarg);
 	if (!knlist_empty(knl))
 		knlist_remove(knl, kn, 1);
 	knl->kl_unlock(knl->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_lio(struct knote *kn, long hint)
 {
 	struct aioliojob * lj = kn->kn_ptr.p_lio;
 
 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
 }
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <sys/socket.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 struct __aiocb_private32 {
 	int32_t	status;
 	int32_t	error;
 	uint32_t kernelinfo;
 };
 
 #ifdef COMPAT_FREEBSD6
 typedef struct oaiocb32 {
 	int	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;		/* I/O buffer in process space */
 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 } oaiocb32_t;
 #endif
 
 typedef struct aiocb32 {
 	int32_t	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;		/* I/O buffer in process space */
 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
 	int	__spare__[2];
 	uint32_t __spare2__;
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
 } aiocb32_t;
 
 #ifdef COMPAT_FREEBSD6
 static int
 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
 {
 
 	/*
 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
 	 * supported by AIO with the old sigevent structure.
 	 */
 	CP(*osig, *nsig, sigev_notify);
 	switch (nsig->sigev_notify) {
 	case SIGEV_NONE:
 		break;
 	case SIGEV_SIGNAL:
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		break;
 	case SIGEV_KEVENT:
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct oaiocb32 job32;
 	int error;
 
 	bzero(kjob, sizeof(struct aiocb));
 	error = copyin(ujob, &job32, sizeof(job32));
 	if (error)
 		return (error);
 
 	CP(job32, *kjob, aio_fildes);
 	CP(job32, *kjob, aio_offset);
 	PTRIN_CP(job32, *kjob, aio_buf);
 	CP(job32, *kjob, aio_nbytes);
 	CP(job32, *kjob, aio_lio_opcode);
 	CP(job32, *kjob, aio_reqprio);
 	CP(job32, *kjob, _aiocb_private.status);
 	CP(job32, *kjob, _aiocb_private.error);
 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
 	return (convert_old_sigevent32(&job32.aio_sigevent,
 	    &kjob->aio_sigevent));
 }
 #endif
 
 static int
 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct aiocb32 job32;
 	int error;
 
 	error = copyin(ujob, &job32, sizeof(job32));
 	if (error)
 		return (error);
 	CP(job32, *kjob, aio_fildes);
 	CP(job32, *kjob, aio_offset);
 	PTRIN_CP(job32, *kjob, aio_buf);
 	CP(job32, *kjob, aio_nbytes);
 	CP(job32, *kjob, aio_lio_opcode);
 	CP(job32, *kjob, aio_reqprio);
 	CP(job32, *kjob, _aiocb_private.status);
 	CP(job32, *kjob, _aiocb_private.error);
 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
 	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
 }
 
 static long
 aiocb32_fetch_status(struct aiocb *ujob)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (fuword32(&ujob32->_aiocb_private.status));
 }
 
 static long
 aiocb32_fetch_error(struct aiocb *ujob)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (fuword32(&ujob32->_aiocb_private.error));
 }
 
 static int
 aiocb32_store_status(struct aiocb *ujob, long status)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.status, status));
 }
 
 static int
 aiocb32_store_error(struct aiocb *ujob, long error)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.error, error));
 }
 
 static int
 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
 }
 
 static int
 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 
 	return (suword32(ujobp, (long)ujob));
 }
 
 static struct aiocb_ops aiocb32_ops = {
 	.aio_copyin = aiocb32_copyin,
 	.fetch_status = aiocb32_fetch_status,
 	.fetch_error = aiocb32_fetch_error,
 	.store_status = aiocb32_store_status,
 	.store_error = aiocb32_store_error,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 static struct aiocb_ops aiocb32_ops_osigevent = {
 	.aio_copyin = aiocb32_copyin_old_sigevent,
 	.fetch_status = aiocb32_fetch_status,
 	.fetch_error = aiocb32_fetch_error,
 	.store_status = aiocb32_store_status,
 	.store_error = aiocb32_store_error,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 #endif
 
 int
 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
 {
 
 	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
 }
 
 int
 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	uint32_t *ujoblist32;
 	int error, i;
 
 	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
 	ujoblist32 = (uint32_t *)ujoblist;
 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
 	    sizeof(ujoblist32[0]));
 	if (error == 0) {
 		for (i = uap->nent - 1; i >= 0; i--)
 			ujoblist[i] = PTRIN(ujoblist32[i]);
 
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	}
 	free(ujoblist, M_AIOS);
 	return (error);
 }
 
 int
 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
 {
 
 	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_aio_read(struct thread *td,
     struct freebsd6_freebsd32_aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
 	    &aiocb32_ops_osigevent));
 }
 #endif
 
 int
 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
 	    &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_aio_write(struct thread *td,
     struct freebsd6_freebsd32_aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
 	    &aiocb32_ops_osigevent));
 }
 #endif
 
 int
 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
 	    &aiocb32_ops));
 }
 
 int
 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
 	    &aiocb32_ops));
 }
 
 int
 freebsd32_aio_waitcomplete(struct thread *td,
     struct freebsd32_aio_waitcomplete_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	int error;
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
 	    &aiocb32_ops));
 }
 
 int
 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
 {
 
 	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
 	    &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_lio_listio(struct thread *td,
     struct freebsd6_freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent32 osig;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent32(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops_osigevent);
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 int
 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct sigevent32 sig32;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &sig32, sizeof(sig32));
 		if (error)
 			return (error);
 		error = convert_sigevent32(&sig32, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops);
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 #endif
Index: head/sys/vm/vm_glue.c
===================================================================
--- head/sys/vm/vm_glue.c	(revision 367333)
+++ head/sys/vm/vm_glue.c	(revision 367334)
@@ -1,603 +1,604 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
 #include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domainset.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
+#include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/smp.h>
 #include <sys/vmmeter.h>
 #include <sys/vmem.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/unistd.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_domainset.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 #include <machine/cpu.h>
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  In most cases
  * just checking the vm_map_entry is sufficient within the kernel's address
  * space.
  */
 int
 kernacc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
 	vm_prot_t prot;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 
 	if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
 		return (FALSE);
 
 	prot = rw;
 	saddr = trunc_page((vm_offset_t)addr);
 	eaddr = round_page((vm_offset_t)addr + len);
 	vm_map_lock_read(kernel_map);
 	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
 	vm_map_unlock_read(kernel_map);
 	return (rv == TRUE);
 }
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
  * used in conjunction with this call.
  */
 int
 useracc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_prot_t prot;
 	vm_map_t map;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
 	prot = rw;
 	map = &curproc->p_vmspace->vm_map;
 	if ((vm_offset_t)addr + len > vm_map_max(map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
 		return (FALSE);
 	}
 	vm_map_lock_read(map);
 	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
 	    round_page((vm_offset_t)addr + len), prot);
 	vm_map_unlock_read(map);
 	return (rv == TRUE);
 }
 
 int
 vslock(void *addr, size_t len)
 {
 	vm_offset_t end, last, start;
 	vm_size_t npages;
 	int error;
 
 	last = (vm_offset_t)addr + len;
 	start = trunc_page((vm_offset_t)addr);
 	end = round_page(last);
 	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
 		return (EINVAL);
 	npages = atop(end - start);
 	if (npages > vm_page_max_user_wired)
 		return (ENOMEM);
 	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 	if (error == KERN_SUCCESS) {
 		curthread->td_vslock_sz += len;
 		return (0);
 	}
 
 	/*
 	 * Return EFAULT on error to match copy{in,out}() behaviour
 	 * rather than returning ENOMEM like mlock() would.
 	 */
 	return (EFAULT);
 }
 
 void
 vsunlock(void *addr, size_t len)
 {
 
 	/* Rely on the parameter sanity checks performed by vslock(). */
 	MPASS(curthread->td_vslock_sz >= len);
 	curthread->td_vslock_sz -= len;
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
 	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 }
 
 /*
  * Pin the page contained within the given object at the given offset.  If the
  * page is not resident, allocate and load it using the given object's pager.
  * Return the pinned page if successful; otherwise, return NULL.
  */
 static vm_page_t
 vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 
 	pindex = OFF_TO_IDX(offset);
 	(void)vm_page_grab_valid_unlocked(&m, object, pindex,
 	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
 	return (m);
 }
 
 /*
  * Return a CPU private mapping to the page at the given offset within the
  * given object.  The page is pinned before it is mapped.
  */
 struct sf_buf *
 vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m;
 
 	m = vm_imgact_hold_page(object, offset);
 	if (m == NULL)
 		return (NULL);
 	sched_pin();
 	return (sf_buf_alloc(m, SFB_CPUPRIVATE));
 }
 
 /*
  * Destroy the given CPU private mapping and unpin the page that it mapped.
  */
 void
 vm_imgact_unmap_page(struct sf_buf *sf)
 {
 	vm_page_t m;
 
 	m = sf_buf_page(sf);
 	sf_buf_free(sf);
 	sched_unpin();
 	vm_page_unwire(m, PQ_ACTIVE);
 }
 
 void
 vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
 {
 
 	pmap_sync_icache(map->pmap, va, sz);
 }
 
 vm_object_t kstack_object;
 static uma_zone_t kstack_cache;
 static int kstack_cache_size;
 
 static int
 sysctl_kstack_cache_size(SYSCTL_HANDLER_ARGS)
 {
 	int error, oldsize;
 
 	oldsize = kstack_cache_size;
 	error = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (error == 0 && req->newptr && oldsize != kstack_cache_size)
 		uma_zone_set_maxcache(kstack_cache, kstack_cache_size);
 	return (error);
 }
 SYSCTL_PROC(_vm, OID_AUTO, kstack_cache_size,
     CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &kstack_cache_size, 0,
     sysctl_kstack_cache_size, "IU", "Maximum number of cached kernel stacks");
 
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  */
 static vm_offset_t
 vm_thread_stack_create(struct domainset *ds, int pages)
 {
 	vm_page_t ma[KSTACK_MAX_PAGES];
 	vm_offset_t ks;
 	int i;
 
 	/*
 	 * Get a kernel virtual address for this thread's kstack.
 	 */
 #if defined(__mips__)
 	/*
 	 * We need to align the kstack's mapped address to fit within
 	 * a single TLB entry.
 	 */
 	if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE,
 	    PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
 	    M_BESTFIT | M_NOWAIT, &ks)) {
 		ks = 0;
 	}
 #else
 	ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 #endif
 	if (ks == 0) {
 		printf("%s: kstack allocation failed\n", __func__);
 		return (0);
 	}
 
 	if (KSTACK_GUARD_PAGES != 0) {
 		pmap_qremove(ks, KSTACK_GUARD_PAGES);
 		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	}
 
 	/*
 	 * Allocate physical pages to back the stack.
 	 */
 	vm_thread_stack_back(ds, ks, ma, pages, VM_ALLOC_NORMAL);
 	for (i = 0; i < pages; i++)
 		vm_page_valid(ma[i]);
 	pmap_qenter(ks, ma, pages);
 
 	return (ks);
 }
 
 static void
 vm_thread_stack_dispose(vm_offset_t ks, int pages)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int i;
 
 	pindex = atop(ks - VM_MIN_KERNEL_ADDRESS);
 
 	pmap_qremove(ks, pages);
 	VM_OBJECT_WLOCK(kstack_object);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(kstack_object, pindex + i);
 		if (m == NULL)
 			panic("%s: kstack already missing?", __func__);
 		vm_page_xbusy_claim(m);
 		vm_page_unwire_noq(m);
 		vm_page_free(m);
 	}
 	VM_OBJECT_WUNLOCK(kstack_object);
 	kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 }
 
 /*
  * Allocate the kernel stack for a new thread.
  */
 int
 vm_thread_new(struct thread *td, int pages)
 {
 	vm_offset_t ks;
 
 	/* Bounds check */
 	if (pages <= 1)
 		pages = kstack_pages;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 
 	ks = 0;
 	if (pages == kstack_pages && kstack_cache != NULL)
 		ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT);
 
 	/*
 	 * Ensure that kstack objects can draw pages from any memory
 	 * domain.  Otherwise a local memory shortage can block a process
 	 * swap-in.
 	 */
 	if (ks == 0)
 		ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)),
 		    pages);
 	if (ks == 0)
 		return (0);
 	td->td_kstack = ks;
 	td->td_kstack_pages = pages;
 	return (1);
 }
 
 /*
  * Dispose of a thread's kernel stack.
  */
 void
 vm_thread_dispose(struct thread *td)
 {
 	vm_offset_t ks;
 	int pages;
 
 	pages = td->td_kstack_pages;
 	ks = td->td_kstack;
 	td->td_kstack = 0;
 	td->td_kstack_pages = 0;
 	if (pages == kstack_pages)
 		uma_zfree(kstack_cache, (void *)ks);
 	else
 		vm_thread_stack_dispose(ks, pages);
 }
 
 /*
  * Allocate physical pages, following the specified NUMA policy, to back a
  * kernel stack.
  */
 void
 vm_thread_stack_back(struct domainset *ds, vm_offset_t ks, vm_page_t ma[],
     int npages, int req_class)
 {
 	vm_pindex_t pindex;
 	int n;
 
 	pindex = atop(ks - VM_MIN_KERNEL_ADDRESS);
 
 	VM_OBJECT_WLOCK(kstack_object);
 	for (n = 0; n < npages;) {
 		if (vm_ndomains > 1)
 			kstack_object->domain.dr_policy = ds;
 
 		/*
 		 * Use WAITFAIL to force a reset of the domain selection policy
 		 * if we had to sleep for pages.
 		 */
 		n += vm_page_grab_pages(kstack_object, pindex + n,
 		    req_class | VM_ALLOC_WIRED | VM_ALLOC_WAITFAIL,
 		    &ma[n], npages - n);
 	}
 	VM_OBJECT_WUNLOCK(kstack_object);
 }
 
 static int
 kstack_import(void *arg, void **store, int cnt, int domain, int flags)
 {
 	struct domainset *ds;
 	int i;
 
 	if (domain == UMA_ANYDOMAIN)
 		ds = DOMAINSET_RR();
 	else
 		ds = DOMAINSET_PREF(domain);
 
 	for (i = 0; i < cnt; i++) {
 		store[i] = (void *)vm_thread_stack_create(ds, kstack_pages);
 		if (store[i] == NULL)
 			break;
 	}
 	return (i);
 }
 
 static void
 kstack_release(void *arg, void **store, int cnt)
 {
 	vm_offset_t ks;
 	int i;
 
 	for (i = 0; i < cnt; i++) {
 		ks = (vm_offset_t)store[i];
 		vm_thread_stack_dispose(ks, kstack_pages);
 	}
 }
 
 static void
 kstack_cache_init(void *null)
 {
 	kstack_object = vm_object_allocate(OBJT_SWAP,
 	    atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS));
 	kstack_cache = uma_zcache_create("kstack_cache",
 	    kstack_pages * PAGE_SIZE, NULL, NULL, NULL, NULL,
 	    kstack_import, kstack_release, NULL,
 	    UMA_ZONE_FIRSTTOUCH);
 	kstack_cache_size = imax(128, mp_ncpus * 4);
 	uma_zone_set_maxcache(kstack_cache, kstack_cache_size);
 }
 SYSINIT(vm_kstacks, SI_SUB_KMEM, SI_ORDER_ANY, kstack_cache_init, NULL);
 
 #ifdef KSTACK_USAGE_PROF
 /*
  * Track maximum stack used by a thread in kernel.
  */
 static int max_kstack_used;
 
 SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
     &max_kstack_used, 0,
     "Maxiumum stack depth used by a thread in kernel");
 
 void
 intr_prof_stack_use(struct thread *td, struct trapframe *frame)
 {
 	vm_offset_t stack_top;
 	vm_offset_t current;
 	int used, prev_used;
 
 	/*
 	 * Testing for interrupted kernel mode isn't strictly
 	 * needed. It optimizes the execution, since interrupts from
 	 * usermode will have only the trap frame on the stack.
 	 */
 	if (TRAPF_USERMODE(frame))
 		return;
 
 	stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
 	current = (vm_offset_t)(uintptr_t)&stack_top;
 
 	/*
 	 * Try to detect if interrupt is using kernel thread stack.
 	 * Hardware could use a dedicated stack for interrupt handling.
 	 */
 	if (stack_top <= current || current < td->td_kstack)
 		return;
 
 	used = stack_top - current;
 	for (;;) {
 		prev_used = max_kstack_used;
 		if (prev_used >= used)
 			break;
 		if (atomic_cmpset_int(&max_kstack_used, prev_used, used))
 			break;
 	}
 }
 #endif /* KSTACK_USAGE_PROF */
 
 /*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * allocate a user struct (pcb and kernel stack), then call the
  * machine-dependent layer to fill those in and make the new process
  * ready to run.  The new process is set up so that it returns directly
  * to user mode to avoid stack copying and relocation problems.
  */
 int
 vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
     struct vmspace *vm2, int flags)
 {
 	struct proc *p1 = td->td_proc;
 	struct domainset *dset;
 	int error;
 
 	if ((flags & RFPROC) == 0) {
 		/*
 		 * Divorce the memory, if it is shared, essentially
 		 * this changes shared memory amongst threads, into
 		 * COW locally.
 		 */
 		if ((flags & RFMEM) == 0) {
-			if (p1->p_vmspace->vm_refcnt > 1) {
+			if (refcount_load(&p1->p_vmspace->vm_refcnt) > 1) {
 				error = vmspace_unshare(p1);
 				if (error)
 					return (error);
 			}
 		}
 		cpu_fork(td, p2, td2, flags);
 		return (0);
 	}
 
 	if (flags & RFMEM) {
 		p2->p_vmspace = p1->p_vmspace;
-		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
+		refcount_acquire(&p1->p_vmspace->vm_refcnt);
 	}
 	dset = td2->td_domain.dr_policy;
 	while (vm_page_count_severe_set(&dset->ds_mask)) {
 		vm_wait_doms(&dset->ds_mask, 0);
 	}
 
 	if ((flags & RFMEM) == 0) {
 		p2->p_vmspace = vm2;
 		if (p1->p_vmspace->vm_shm)
 			shmfork(p1, p2);
 	}
 
 	/*
 	 * cpu_fork will copy and update the pcb, set up the kernel stack,
 	 * and make the child ready to run.
 	 */
 	cpu_fork(td, p2, td2, flags);
 	return (0);
 }
 
 /*
  * Called after process has been wait(2)'ed upon and is being reaped.
  * The idea is to reclaim resources that we could not reclaim while
  * the process was still executing.
  */
 void
 vm_waitproc(p)
 	struct proc *p;
 {
 
 	vmspace_exitfree(p);		/* and clean-out the vmspace */
 }
 
 void
 kick_proc0(void)
 {
 
 	wakeup(&proc0);
 }
Index: head/sys/vm/vm_map.c
===================================================================
--- head/sys/vm/vm_map.c	(revision 367333)
+++ head/sys/vm/vm_map.c	(revision 367334)
@@ -1,5277 +1,5263 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory mapping module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/elf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/vnode.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/file.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vnode_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 /*
  *	Virtual memory maps provide for the mapping, protection,
  *	and sharing of virtual memory objects.  In addition,
  *	this module provides for an efficient virtual copy of
  *	memory from one map to another.
  *
  *	Synchronization is required prior to most operations.
  *
  *	Maps consist of an ordered doubly-linked list of simple
  *	entries; a self-adjusting binary search tree of these
  *	entries is used to speed up lookups.
  *
  *	Since portions of maps are specified by start/end addresses,
  *	which may not align with existing map entries, all
  *	routines merely "clip" entries to these start/end values.
  *	[That is, an entry is split into two, bordering at a
  *	start or end value.]  Note that these clippings may not
  *	always be necessary (as the two resulting entries are then
  *	not changed); however, the clipping is done for convenience.
  *
  *	As mentioned above, virtual copy operations are performed
  *	by copying VM object references from one map to
  *	another, and then marking both regions as copy-on-write.
  */
 
 static struct mtx map_sleep_mtx;
 static uma_zone_t mapentzone;
 static uma_zone_t kmapentzone;
 static uma_zone_t vmspace_zone;
 static int vmspace_zinit(void *mem, int size, int flags);
 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
     vm_offset_t max);
 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
     vm_map_entry_t gap_entry);
 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
 #ifdef INVARIANTS
 static void vmspace_zdtor(void *mem, int size, void *arg);
 #endif
 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
     int cow);
 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
     vm_offset_t failed_addr);
 
 #define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
 
 /* 
  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
  * stable.
  */
 #define PROC_VMSPACE_LOCK(p) do { } while (0)
 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
 
 /*
  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
  *
  *	Asserts that the starting and ending region
  *	addresses fall within the valid range of the map.
  */
 #define	VM_MAP_RANGE_CHECK(map, start, end)		\
 		{					\
 		if (start < vm_map_min(map))		\
 			start = vm_map_min(map);	\
 		if (end > vm_map_max(map))		\
 			end = vm_map_max(map);		\
 		if (start > end)			\
 			start = end;			\
 		}
 
 /*
  *	vm_map_startup:
  *
  *	Initialize the vm_map module.  Must be called before
  *	any other vm_map routines.
  *
  *	Map and entry structures are allocated from the general
  *	purpose memory pool with some exceptions:
  *
  *	- The kernel map and kmem submap are allocated statically.
  *	- Kernel map entries are allocated out of a static pool.
  *
  *	These restrictions are necessary since malloc() uses the
  *	maps and requires map entries.
  */
 
 void
 vm_map_startup(void)
 {
 	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
 #ifdef INVARIANTS
 	    vmspace_zdtor,
 #else
 	    NULL,
 #endif
 	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 
 static int
 vmspace_zinit(void *mem, int size, int flags)
 {
 	struct vmspace *vm;
 	vm_map_t map;
 
 	vm = (struct vmspace *)mem;
 	map = &vm->vm_map;
 
 	memset(map, 0, sizeof(*map));
 	mtx_init(&map->system_mtx, "vm map (system)", NULL,
 	    MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "vm map (user)");
 	PMAP_LOCK_INIT(vmspace_pmap(vm));
 	return (0);
 }
 
 #ifdef INVARIANTS
 static void
 vmspace_zdtor(void *mem, int size, void *arg)
 {
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
 	KASSERT(vm->vm_map.nentries == 0,
 	    ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
 	KASSERT(vm->vm_map.size == 0,
 	    ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
 }
 #endif	/* INVARIANTS */
 
 /*
  * Allocate a vmspace structure, including a vm_map and pmap,
  * and initialize those structures.  The refcnt is set to 1.
  */
 struct vmspace *
 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
 {
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
 	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
 	if (!pinit(vmspace_pmap(vm))) {
 		uma_zfree(vmspace_zone, vm);
 		return (NULL);
 	}
 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
 	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
-	vm->vm_refcnt = 1;
+	refcount_init(&vm->vm_refcnt, 1);
 	vm->vm_shm = NULL;
 	vm->vm_swrss = 0;
 	vm->vm_tsize = 0;
 	vm->vm_dsize = 0;
 	vm->vm_ssize = 0;
 	vm->vm_taddr = 0;
 	vm->vm_daddr = 0;
 	vm->vm_maxsaddr = 0;
 	return (vm);
 }
 
 #ifdef RACCT
 static void
 vmspace_container_reset(struct proc *p)
 {
 
 	PROC_LOCK(p);
 	racct_set(p, RACCT_DATA, 0);
 	racct_set(p, RACCT_STACK, 0);
 	racct_set(p, RACCT_RSS, 0);
 	racct_set(p, RACCT_MEMLOCK, 0);
 	racct_set(p, RACCT_VMEM, 0);
 	PROC_UNLOCK(p);
 }
 #endif
 
 static inline void
 vmspace_dofree(struct vmspace *vm)
 {
 
 	CTR1(KTR_VM, "vmspace_free: %p", vm);
 
 	/*
 	 * Make sure any SysV shm is freed, it might not have been in
 	 * exit1().
 	 */
 	shmexit(vm);
 
 	/*
 	 * Lock the map, to wait out all other references to it.
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
 	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
 	    vm_map_max(&vm->vm_map));
 
 	pmap_release(vmspace_pmap(vm));
 	vm->vm_map.pmap = NULL;
 	uma_zfree(vmspace_zone, vm);
 }
 
 void
 vmspace_free(struct vmspace *vm)
 {
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "vmspace_free() called");
 
-	if (vm->vm_refcnt == 0)
-		panic("vmspace_free: attempt to free already freed vmspace");
-
-	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
+	if (refcount_release(&vm->vm_refcnt))
 		vmspace_dofree(vm);
 }
 
 void
 vmspace_exitfree(struct proc *p)
 {
 	struct vmspace *vm;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	p->p_vmspace = NULL;
 	PROC_VMSPACE_UNLOCK(p);
 	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
 	vmspace_free(vm);
 }
 
 void
 vmspace_exit(struct thread *td)
 {
-	int refcnt;
 	struct vmspace *vm;
 	struct proc *p;
+	bool released;
 
-	/*
-	 * Release user portion of address space.
-	 * This releases references to vnodes,
-	 * which could cause I/O if the file has been unlinked.
-	 * Need to do this early enough that we can still sleep.
-	 *
-	 * The last exiting process to reach this point releases as
-	 * much of the environment as it can. vmspace_dofree() is the
-	 * slower fallback in case another process had a temporary
-	 * reference to the vmspace.
-	 */
-
 	p = td->td_proc;
 	vm = p->p_vmspace;
-	atomic_add_int(&vmspace0.vm_refcnt, 1);
-	refcnt = vm->vm_refcnt;
-	do {
-		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
-			/* Switch now since other proc might free vmspace */
+
+	/*
+	 * Prepare to release the vmspace reference.  The thread that releases
+	 * the last reference is responsible for tearing down the vmspace.
+	 * However, threads not releasing the final reference must switch to the
+	 * kernel's vmspace0 before the decrement so that the subsequent pmap
+	 * deactivation does not modify a freed vmspace.
+	 */
+	refcount_acquire(&vmspace0.vm_refcnt);
+	if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
+		if (p->p_vmspace != &vmspace0) {
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = &vmspace0;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
-	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1));
-	if (refcnt == 1) {
+		released = refcount_release(&vm->vm_refcnt);
+	}
+	if (released) {
+		/*
+		 * pmap_remove_pages() expects the pmap to be active, so switch
+		 * back first if necessary.
+		 */
 		if (p->p_vmspace != vm) {
-			/* vmspace not yet freed, switch back */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = vm;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 		pmap_remove_pages(vmspace_pmap(vm));
-		/* Switch now since this proc will free vmspace */
 		PROC_VMSPACE_LOCK(p);
 		p->p_vmspace = &vmspace0;
 		PROC_VMSPACE_UNLOCK(p);
 		pmap_activate(td);
 		vmspace_dofree(vm);
 	}
 #ifdef RACCT
 	if (racct_enable)
 		vmspace_container_reset(p);
 #endif
 }
 
 /* Acquire reference to vmspace owned by another process. */
 
 struct vmspace *
 vmspace_acquire_ref(struct proc *p)
 {
 	struct vmspace *vm;
-	int refcnt;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
-	if (vm == NULL) {
+	if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
 		PROC_VMSPACE_UNLOCK(p);
 		return (NULL);
 	}
-	refcnt = vm->vm_refcnt;
-	do {
-		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
-			PROC_VMSPACE_UNLOCK(p);
-			return (NULL);
-		}
-	} while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1));
 	if (vm != p->p_vmspace) {
 		PROC_VMSPACE_UNLOCK(p);
 		vmspace_free(vm);
 		return (NULL);
 	}
 	PROC_VMSPACE_UNLOCK(p);
 	return (vm);
 }
 
 /*
  * Switch between vmspaces in an AIO kernel process.
  *
  * The new vmspace is either the vmspace of a user process obtained
  * from an active AIO request or the initial vmspace of the AIO kernel
  * process (when it is idling).  Because user processes will block to
  * drain any active AIO requests before proceeding in exit() or
  * execve(), the reference count for vmspaces from AIO requests can
  * never be 0.  Similarly, AIO kernel processes hold an extra
  * reference on their initial vmspace for the life of the process.  As
  * a result, the 'newvm' vmspace always has a non-zero reference
  * count.  This permits an additional reference on 'newvm' to be
  * acquired via a simple atomic increment rather than the loop in
  * vmspace_acquire_ref() above.
  */
 void
 vmspace_switch_aio(struct vmspace *newvm)
 {
 	struct vmspace *oldvm;
 
 	/* XXX: Need some way to assert that this is an aio daemon. */
 
-	KASSERT(newvm->vm_refcnt > 0,
+	KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
 	    ("vmspace_switch_aio: newvm unreferenced"));
 
 	oldvm = curproc->p_vmspace;
 	if (oldvm == newvm)
 		return;
 
 	/*
 	 * Point to the new address space and refer to it.
 	 */
 	curproc->p_vmspace = newvm;
-	atomic_add_int(&newvm->vm_refcnt, 1);
+	refcount_acquire(&newvm->vm_refcnt);
 
 	/* Activate the new mapping. */
 	pmap_activate(curthread);
 
 	vmspace_free(oldvm);
 }
 
 void
 _vm_map_lock(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_xlock_(&map->lock, file, line);
 	map->timestamp++;
 }
 
 void
 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
 {
 	vm_object_t object;
 	struct vnode *vp;
 	bool vp_held;
 
 	if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
 		return;
 	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 	    ("Submap with execs"));
 	object = entry->object.vm_object;
 	KASSERT(object != NULL, ("No object for text, entry %p", entry));
 	if ((object->flags & OBJ_ANON) != 0)
 		object = object->handle;
 	else
 		KASSERT(object->backing_object == NULL,
 		    ("non-anon object %p shadows", object));
 	KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
 	    entry, entry->object.vm_object));
 
 	/*
 	 * Mostly, we do not lock the backing object.  It is
 	 * referenced by the entry we are processing, so it cannot go
 	 * away.
 	 */
 	vp = NULL;
 	vp_held = false;
 	if (object->type == OBJT_DEAD) {
 		/*
 		 * For OBJT_DEAD objects, v_writecount was handled in
 		 * vnode_pager_dealloc().
 		 */
 	} else if (object->type == OBJT_VNODE) {
 		vp = object->handle;
 	} else if (object->type == OBJT_SWAP) {
 		KASSERT((object->flags & OBJ_TMPFS_NODE) != 0,
 		    ("vm_map_entry_set_vnode_text: swap and !TMPFS "
 		    "entry %p, object %p, add %d", entry, object, add));
 		/*
 		 * Tmpfs VREG node, which was reclaimed, has
 		 * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS.  In
 		 * this case there is no v_writecount to adjust.
 		 */
 		VM_OBJECT_RLOCK(object);
 		if ((object->flags & OBJ_TMPFS) != 0) {
 			vp = object->un_pager.swp.swp_tmpfs;
 			if (vp != NULL) {
 				vhold(vp);
 				vp_held = true;
 			}
 		}
 		VM_OBJECT_RUNLOCK(object);
 	} else {
 		KASSERT(0,
 		    ("vm_map_entry_set_vnode_text: wrong object type, "
 		    "entry %p, object %p, add %d", entry, object, add));
 	}
 	if (vp != NULL) {
 		if (add) {
 			VOP_SET_TEXT_CHECKED(vp);
 		} else {
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			VOP_UNSET_TEXT_CHECKED(vp);
 			VOP_UNLOCK(vp);
 		}
 		if (vp_held)
 			vdrop(vp);
 	}
 }
 
 /*
  * Use a different name for this vm_map_entry field when it's use
  * is not consistent with its use as part of an ordered search tree.
  */
 #define defer_next right
 
 static void
 vm_map_process_deferred(void)
 {
 	struct thread *td;
 	vm_map_entry_t entry, next;
 	vm_object_t object;
 
 	td = curthread;
 	entry = td->td_map_def_user;
 	td->td_map_def_user = NULL;
 	while (entry != NULL) {
 		next = entry->defer_next;
 		MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
 		    MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
 		    MAP_ENTRY_VN_EXEC));
 		if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
 			/*
 			 * Decrement the object's writemappings and
 			 * possibly the vnode's v_writecount.
 			 */
 			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 			    ("Submap with writecount"));
 			object = entry->object.vm_object;
 			KASSERT(object != NULL, ("No object for writecount"));
 			vm_pager_release_writecount(object, entry->start,
 			    entry->end);
 		}
 		vm_map_entry_set_vnode_text(entry, false);
 		vm_map_entry_deallocate(entry, FALSE);
 		entry = next;
 	}
 }
 
 #ifdef INVARIANTS
 static void
 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	else
 		sx_assert_(&map->lock, SA_XLOCKED, file, line);
 }
 
 #define	VM_MAP_ASSERT_LOCKED(map) \
     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
 
 enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
 #ifdef DIAGNOSTIC
 static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
 #else
 static int enable_vmmap_check = VMMAP_CHECK_NONE;
 #endif
 SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
     &enable_vmmap_check, 0, "Enable vm map consistency checking");
 
 static void _vm_map_assert_consistent(vm_map_t map, int check);
 
 #define VM_MAP_ASSERT_CONSISTENT(map) \
     _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
 #ifdef DIAGNOSTIC
 #define VM_MAP_UNLOCK_CONSISTENT(map) do {				\
 	if (map->nupdates > map->nentries) {				\
 		_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK);	\
 		map->nupdates = 0;					\
 	}								\
 } while (0)
 #else
 #define VM_MAP_UNLOCK_CONSISTENT(map)
 #endif
 #else
 #define	VM_MAP_ASSERT_LOCKED(map)
 #define VM_MAP_ASSERT_CONSISTENT(map)
 #define VM_MAP_UNLOCK_CONSISTENT(map)
 #endif /* INVARIANTS */
 
 void
 _vm_map_unlock(vm_map_t map, const char *file, int line)
 {
 
 	VM_MAP_UNLOCK_CONSISTENT(map);
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else {
 		sx_xunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
 }
 
 void
 _vm_map_lock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_lock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_slock_(&map->lock, file, line);
 }
 
 void
 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else {
 		sx_sunlock_(&map->lock, file, line);
 		vm_map_process_deferred();
 	}
 }
 
 int
 _vm_map_trylock(vm_map_t map, const char *file, int line)
 {
 	int error;
 
 	error = map->system_map ?
 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
 	    !sx_try_xlock_(&map->lock, file, line);
 	if (error == 0)
 		map->timestamp++;
 	return (error == 0);
 }
 
 int
 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
 {
 	int error;
 
 	error = map->system_map ?
 	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
 	    !sx_try_slock_(&map->lock, file, line);
 	return (error == 0);
 }
 
 /*
  *	_vm_map_lock_upgrade:	[ internal use only ]
  *
  *	Tries to upgrade a read (shared) lock on the specified map to a write
  *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
  *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
  *	returned without a read or write lock held.
  *
  *	Requires that the map be read locked.
  */
 int
 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
 {
 	unsigned int last_timestamp;
 
 	if (map->system_map) {
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else {
 		if (!sx_try_upgrade_(&map->lock, file, line)) {
 			last_timestamp = map->timestamp;
 			sx_sunlock_(&map->lock, file, line);
 			vm_map_process_deferred();
 			/*
 			 * If the map's timestamp does not change while the
 			 * map is unlocked, then the upgrade succeeds.
 			 */
 			sx_xlock_(&map->lock, file, line);
 			if (last_timestamp != map->timestamp) {
 				sx_xunlock_(&map->lock, file, line);
 				return (1);
 			}
 		}
 	}
 	map->timestamp++;
 	return (0);
 }
 
 void
 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
 {
 
 	if (map->system_map) {
 		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
 	} else {
 		VM_MAP_UNLOCK_CONSISTENT(map);
 		sx_downgrade_(&map->lock, file, line);
 	}
 }
 
 /*
  *	vm_map_locked:
  *
  *	Returns a non-zero value if the caller holds a write (exclusive) lock
  *	on the specified map and the value "0" otherwise.
  */
 int
 vm_map_locked(vm_map_t map)
 {
 
 	if (map->system_map)
 		return (mtx_owned(&map->system_mtx));
 	else
 		return (sx_xlocked(&map->lock));
 }
 
 /*
  *	_vm_map_unlock_and_wait:
  *
  *	Atomically releases the lock on the specified map and puts the calling
  *	thread to sleep.  The calling thread will remain asleep until either
  *	vm_map_wakeup() is performed on the map or the specified timeout is
  *	exceeded.
  *
  *	WARNING!  This function does not perform deferred deallocations of
  *	objects and map	entries.  Therefore, the calling thread is expected to
  *	reacquire the map lock after reawakening and later perform an ordinary
  *	unlock operation, such as vm_map_unlock(), before completing its
  *	operation on the map.
  */
 int
 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
 {
 
 	VM_MAP_UNLOCK_CONSISTENT(map);
 	mtx_lock(&map_sleep_mtx);
 	if (map->system_map)
 		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
 	else
 		sx_xunlock_(&map->lock, file, line);
 	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
 	    timo));
 }
 
 /*
  *	vm_map_wakeup:
  *
  *	Awaken any threads that have slept on the map using
  *	vm_map_unlock_and_wait().
  */
 void
 vm_map_wakeup(vm_map_t map)
 {
 
 	/*
 	 * Acquire and release map_sleep_mtx to prevent a wakeup()
 	 * from being performed (and lost) between the map unlock
 	 * and the msleep() in _vm_map_unlock_and_wait().
 	 */
 	mtx_lock(&map_sleep_mtx);
 	mtx_unlock(&map_sleep_mtx);
 	wakeup(&map->root);
 }
 
 void
 vm_map_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	map->busy++;
 }
 
 void
 vm_map_unbusy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
 	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
 		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
 		wakeup(&map->busy);
 	}
 }
 
 void 
 vm_map_wait_busy(vm_map_t map)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	while (map->busy) {
 		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
 		if (map->system_map)
 			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
 		else
 			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
 	}
 	map->timestamp++;
 }
 
 long
 vmspace_resident_count(struct vmspace *vmspace)
 {
 	return pmap_resident_count(vmspace_pmap(vmspace));
 }
 
 /*
  * Initialize an existing vm_map structure
  * such as that in the vmspace structure.
  */
 static void
 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	map->header.eflags = MAP_ENTRY_HEADER;
 	map->needs_wakeup = FALSE;
 	map->system_map = 0;
 	map->pmap = pmap;
 	map->header.end = min;
 	map->header.start = max;
 	map->flags = 0;
 	map->header.left = map->header.right = &map->header;
 	map->root = NULL;
 	map->timestamp = 0;
 	map->busy = 0;
 	map->anon_loc = 0;
 #ifdef DIAGNOSTIC
 	map->nupdates = 0;
 #endif
 }
 
 void
 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 
 	_vm_map_init(map, pmap, min, max);
 	mtx_init(&map->system_mtx, "vm map (system)", NULL,
 	    MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "vm map (user)");
 }
 
 /*
  *	vm_map_entry_dispose:	[ internal use only ]
  *
  *	Inverse of vm_map_entry_create.
  */
 static void
 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_create:	[ internal use only ]
  *
  *	Allocates a VM map entry for insertion.
  *	No entry fields are filled in.
  */
 static vm_map_entry_t
 vm_map_entry_create(vm_map_t map)
 {
 	vm_map_entry_t new_entry;
 
 	if (map->system_map)
 		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
 	else
 		new_entry = uma_zalloc(mapentzone, M_WAITOK);
 	if (new_entry == NULL)
 		panic("vm_map_entry_create: kernel resources exhausted");
 	return (new_entry);
 }
 
 /*
  *	vm_map_entry_set_behavior:
  *
  *	Set the expected access behavior, either normal, random, or
  *	sequential.
  */
 static inline void
 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
 {
 	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
 	    (behavior & MAP_ENTRY_BEHAV_MASK);
 }
 
 /*
  *	vm_map_entry_max_free_{left,right}:
  *
  *	Compute the size of the largest free gap between two entries,
  *	one the root of a tree and the other the ancestor of that root
  *	that is the least or greatest ancestor found on the search path.
  */
 static inline vm_size_t
 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
 {
 
 	return (root->left != left_ancestor ?
 	    root->left->max_free : root->start - left_ancestor->end);
 }
 
 static inline vm_size_t
 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
 {
 
 	return (root->right != right_ancestor ?
 	    root->right->max_free : right_ancestor->start - root->end);
 }
 
 /*
  *	vm_map_entry_{pred,succ}:
  *
  *	Find the {predecessor, successor} of the entry by taking one step
  *	in the appropriate direction and backtracking as much as necessary.
  *	vm_map_entry_succ is defined in vm_map.h.
  */
 static inline vm_map_entry_t
 vm_map_entry_pred(vm_map_entry_t entry)
 {
 	vm_map_entry_t prior;
 
 	prior = entry->left;
 	if (prior->right->start < entry->start) {
 		do
 			prior = prior->right;
 		while (prior->right != entry);
 	}
 	return (prior);
 }
 
 static inline vm_size_t
 vm_size_max(vm_size_t a, vm_size_t b)
 {
 
 	return (a > b ? a : b);
 }
 
 #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do {		\
 	vm_map_entry_t z;						\
 	vm_size_t max_free;						\
 									\
 	/*								\
 	 * Infer root->right->max_free == root->max_free when		\
 	 * y->max_free < root->max_free || root->max_free == 0.		\
 	 * Otherwise, look right to find it.				\
 	 */								\
 	y = root->left;							\
 	max_free = root->max_free;					\
 	KASSERT(max_free == vm_size_max(				\
 	    vm_map_entry_max_free_left(root, llist),			\
 	    vm_map_entry_max_free_right(root, rlist)),			\
 	    ("%s: max_free invariant fails", __func__));		\
 	if (max_free - 1 < vm_map_entry_max_free_left(root, llist))	\
 		max_free = vm_map_entry_max_free_right(root, rlist);	\
 	if (y != llist && (test)) {					\
 		/* Rotate right and make y root. */			\
 		z = y->right;						\
 		if (z != root) {					\
 			root->left = z;					\
 			y->right = root;				\
 			if (max_free < y->max_free)			\
 			    root->max_free = max_free =			\
 			    vm_size_max(max_free, z->max_free);		\
 		} else if (max_free < y->max_free)			\
 			root->max_free = max_free =			\
 			    vm_size_max(max_free, root->start - y->end);\
 		root = y;						\
 		y = root->left;						\
 	}								\
 	/* Copy right->max_free.  Put root on rlist. */			\
 	root->max_free = max_free;					\
 	KASSERT(max_free == vm_map_entry_max_free_right(root, rlist),	\
 	    ("%s: max_free not copied from right", __func__));		\
 	root->left = rlist;						\
 	rlist = root;							\
 	root = y != llist ? y : NULL;					\
 } while (0)
 
 #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do {		\
 	vm_map_entry_t z;						\
 	vm_size_t max_free;						\
 									\
 	/*								\
 	 * Infer root->left->max_free == root->max_free when		\
 	 * y->max_free < root->max_free || root->max_free == 0.		\
 	 * Otherwise, look left to find it.				\
 	 */								\
 	y = root->right;						\
 	max_free = root->max_free;					\
 	KASSERT(max_free == vm_size_max(				\
 	    vm_map_entry_max_free_left(root, llist),			\
 	    vm_map_entry_max_free_right(root, rlist)),			\
 	    ("%s: max_free invariant fails", __func__));		\
 	if (max_free - 1 < vm_map_entry_max_free_right(root, rlist))	\
 		max_free = vm_map_entry_max_free_left(root, llist);	\
 	if (y != rlist && (test)) {					\
 		/* Rotate left and make y root. */			\
 		z = y->left;						\
 		if (z != root) {					\
 			root->right = z;				\
 			y->left = root;					\
 			if (max_free < y->max_free)			\
 			    root->max_free = max_free =			\
 			    vm_size_max(max_free, z->max_free);		\
 		} else if (max_free < y->max_free)			\
 			root->max_free = max_free =			\
 			    vm_size_max(max_free, y->start - root->end);\
 		root = y;						\
 		y = root->right;					\
 	}								\
 	/* Copy left->max_free.  Put root on llist. */			\
 	root->max_free = max_free;					\
 	KASSERT(max_free == vm_map_entry_max_free_left(root, llist),	\
 	    ("%s: max_free not copied from left", __func__));		\
 	root->right = llist;						\
 	llist = root;							\
 	root = y != rlist ? y : NULL;					\
 } while (0)
 
 /*
  * Walk down the tree until we find addr or a gap where addr would go, breaking
  * off left and right subtrees of nodes less than, or greater than addr.  Treat
  * subtrees with root->max_free < length as empty trees.  llist and rlist are
  * the two sides in reverse order (bottom-up), with llist linked by the right
  * pointer and rlist linked by the left pointer in the vm_map_entry, and both
  * lists terminated by &map->header.  This function, and the subsequent call to
  * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
  * values in &map->header.
  */
 static __always_inline vm_map_entry_t
 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
     vm_map_entry_t *llist, vm_map_entry_t *rlist)
 {
 	vm_map_entry_t left, right, root, y;
 
 	left = right = &map->header;
 	root = map->root;
 	while (root != NULL && root->max_free >= length) {
 		KASSERT(left->end <= root->start &&
 		    root->end <= right->start,
 		    ("%s: root not within tree bounds", __func__));
 		if (addr < root->start) {
 			SPLAY_LEFT_STEP(root, y, left, right,
 			    y->max_free >= length && addr < y->start);
 		} else if (addr >= root->end) {
 			SPLAY_RIGHT_STEP(root, y, left, right,
 			    y->max_free >= length && addr >= y->end);
 		} else
 			break;
 	}
 	*llist = left;
 	*rlist = right;
 	return (root);
 }
 
 static __always_inline void
 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
 {
 	vm_map_entry_t hi, right, y;
 
 	right = *rlist;
 	hi = root->right == right ? NULL : root->right;
 	if (hi == NULL)
 		return;
 	do
 		SPLAY_LEFT_STEP(hi, y, root, right, true);
 	while (hi != NULL);
 	*rlist = right;
 }
 
 static __always_inline void
 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
 {
 	vm_map_entry_t left, lo, y;
 
 	left = *llist;
 	lo = root->left == left ? NULL : root->left;
 	if (lo == NULL)
 		return;
 	do
 		SPLAY_RIGHT_STEP(lo, y, left, root, true);
 	while (lo != NULL);
 	*llist = left;
 }
 
 static inline void
 vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
 {
 	vm_map_entry_t tmp;
 
 	tmp = *b;
 	*b = *a;
 	*a = tmp;
 }
 
 /*
  * Walk back up the two spines, flip the pointers and set max_free.  The
  * subtrees of the root go at the bottom of llist and rlist.
  */
 static vm_size_t
 vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
 {
 	do {
 		/*
 		 * The max_free values of the children of llist are in
 		 * llist->max_free and max_free.  Update with the
 		 * max value.
 		 */
 		llist->max_free = max_free =
 		    vm_size_max(llist->max_free, max_free);
 		vm_map_entry_swap(&llist->right, &tail);
 		vm_map_entry_swap(&tail, &llist);
 	} while (llist != header);
 	root->left = tail;
 	return (max_free);
 }
 
 /*
  * When llist is known to be the predecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t llist)
 {
 	vm_size_t max_free;
 
 	max_free = root->start - llist->end;
 	if (llist != header) {
 		max_free = vm_map_splay_merge_left_walk(header, root,
 		    root, max_free, llist);
 	} else {
 		root->left = header;
 		header->right = root;
 	}
 	return (max_free);
 }
 
 /*
  * When llist may or may not be the predecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t llist)
 {
 	vm_size_t max_free;
 
 	max_free = vm_map_entry_max_free_left(root, llist);
 	if (llist != header) {
 		max_free = vm_map_splay_merge_left_walk(header, root,
 		    root->left == llist ? root : root->left,
 		    max_free, llist);
 	}
 	return (max_free);
 }
 
 static vm_size_t
 vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
 {
 	do {
 		/*
 		 * The max_free values of the children of rlist are in
 		 * rlist->max_free and max_free.  Update with the
 		 * max value.
 		 */
 		rlist->max_free = max_free =
 		    vm_size_max(rlist->max_free, max_free);
 		vm_map_entry_swap(&rlist->left, &tail);
 		vm_map_entry_swap(&tail, &rlist);
 	} while (rlist != header);
 	root->right = tail;
 	return (max_free);
 }
 
 /*
  * When rlist is known to be the succecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t rlist)
 {
 	vm_size_t max_free;
 
 	max_free = rlist->start - root->end;
 	if (rlist != header) {
 		max_free = vm_map_splay_merge_right_walk(header, root,
 		    root, max_free, rlist);
 	} else {
 		root->right = header;
 		header->left = root;
 	}
 	return (max_free);
 }
 
 /*
  * When rlist may or may not be the succecessor of root.
  */
 static inline vm_size_t
 vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
     vm_map_entry_t rlist)
 {
 	vm_size_t max_free;
 
 	max_free = vm_map_entry_max_free_right(root, rlist);
 	if (rlist != header) {
 		max_free = vm_map_splay_merge_right_walk(header, root,
 		    root->right == rlist ? root : root->right,
 		    max_free, rlist);
 	}
 	return (max_free);
 }
 
 /*
  *	vm_map_splay:
  *
  *	The Sleator and Tarjan top-down splay algorithm with the
  *	following variation.  Max_free must be computed bottom-up, so
  *	on the downward pass, maintain the left and right spines in
  *	reverse order.  Then, make a second pass up each side to fix
  *	the pointers and compute max_free.  The time bound is O(log n)
  *	amortized.
  *
  *	The tree is threaded, which means that there are no null pointers.
  *	When a node has no left child, its left pointer points to its
  *	predecessor, which the last ancestor on the search path from the root
  *	where the search branched right.  Likewise, when a node has no right
  *	child, its right pointer points to its successor.  The map header node
  *	is the predecessor of the first map entry, and the successor of the
  *	last.
  *
  *	The new root is the vm_map_entry containing "addr", or else an
  *	adjacent entry (lower if possible) if addr is not in the tree.
  *
  *	The map must be locked, and leaves it so.
  *
  *	Returns: the new root.
  */
 static vm_map_entry_t
 vm_map_splay(vm_map_t map, vm_offset_t addr)
 {
 	vm_map_entry_t header, llist, rlist, root;
 	vm_size_t max_free_left, max_free_right;
 
 	header = &map->header;
 	root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
 	if (root != NULL) {
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else if (llist != header) {
 		/*
 		 * Recover the greatest node in the left
 		 * subtree and make it the root.
 		 */
 		root = llist;
 		llist = root->right;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
 	} else if (rlist != header) {
 		/*
 		 * Recover the least node in the right
 		 * subtree and make it the root.
 		 */
 		root = rlist;
 		rlist = root->left;
 		max_free_left = vm_map_splay_merge_pred(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else {
 		/* There is no root. */
 		return (NULL);
 	}
 	root->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	return (root);
 }
 
 /*
  *	vm_map_entry_{un,}link:
  *
  *	Insert/remove entries from maps.  On linking, if new entry clips
  *	existing entry, trim existing entry to avoid overlap, and manage
  *	offsets.  On unlinking, merge disappearing entry with neighbor, if
  *	called for, and manage offsets.  Callers should not modify fields in
  *	entries already mapped.
  */
 static void
 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_map_entry_t header, llist, rlist, root;
 	vm_size_t max_free_left, max_free_right;
 
 	CTR3(KTR_VM,
 	    "vm_map_entry_link: map %p, nentries %d, entry %p", map,
 	    map->nentries, entry);
 	VM_MAP_ASSERT_LOCKED(map);
 	map->nentries++;
 	header = &map->header;
 	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
 	if (root == NULL) {
 		/*
 		 * The new entry does not overlap any existing entry in the
 		 * map, so it becomes the new root of the map tree.
 		 */
 		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
 		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
 	} else if (entry->start == root->start) {
 		/*
 		 * The new entry is a clone of root, with only the end field
 		 * changed.  The root entry will be shrunk to abut the new
 		 * entry, and will be the right child of the new root entry in
 		 * the modified map.
 		 */
 		KASSERT(entry->end < root->end,
 		    ("%s: clip_start not within entry", __func__));
 		vm_map_splay_findprev(root, &llist);
 		root->offset += entry->end - root->start;
 		root->start = entry->end;
 		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
 		max_free_right = root->max_free = vm_size_max(
 		    vm_map_splay_merge_pred(entry, root, entry),
 		    vm_map_splay_merge_right(header, root, rlist));
 	} else {
 		/*
 		 * The new entry is a clone of root, with only the start field
 		 * changed.  The root entry will be shrunk to abut the new
 		 * entry, and will be the left child of the new root entry in
 		 * the modified map.
 		 */
 		KASSERT(entry->end == root->end,
 		    ("%s: clip_start not within entry", __func__));
 		vm_map_splay_findnext(root, &rlist);
 		entry->offset += entry->start - root->start;
 		root->end = entry->start;
 		max_free_left = root->max_free = vm_size_max(
 		    vm_map_splay_merge_left(header, root, llist),
 		    vm_map_splay_merge_succ(entry, root, entry));
 		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
 	}
 	entry->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = entry;
 	VM_MAP_ASSERT_CONSISTENT(map);
 }
 
 enum unlink_merge_type {
 	UNLINK_MERGE_NONE,
 	UNLINK_MERGE_NEXT
 };
 
 static void
 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
     enum unlink_merge_type op)
 {
 	vm_map_entry_t header, llist, rlist, root;
 	vm_size_t max_free_left, max_free_right;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	header = &map->header;
 	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
 	KASSERT(root != NULL,
 	    ("vm_map_entry_unlink: unlink object not mapped"));
 
 	vm_map_splay_findprev(root, &llist);
 	vm_map_splay_findnext(root, &rlist);
 	if (op == UNLINK_MERGE_NEXT) {
 		rlist->start = root->start;
 		rlist->offset = root->offset;
 	}
 	if (llist != header) {
 		root = llist;
 		llist = root->right;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
 	} else if (rlist != header) {
 		root = rlist;
 		rlist = root->left;
 		max_free_left = vm_map_splay_merge_pred(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else {
 		header->left = header->right = header;
 		root = NULL;
 	}
 	if (root != NULL)
 		root->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	map->nentries--;
 	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
 	    map->nentries, entry);
 }
 
 /*
  *	vm_map_entry_resize:
  *
  *	Resize a vm_map_entry, recompute the amount of free space that
  *	follows it and propagate that value up the tree.
  *
  *	The map must be locked, and leaves it so.
  */
 static void
 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
 {
 	vm_map_entry_t header, llist, rlist, root;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	header = &map->header;
 	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
 	KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
 	vm_map_splay_findnext(root, &rlist);
 	entry->end += grow_amount;
 	root->max_free = vm_size_max(
 	    vm_map_splay_merge_left(header, root, llist),
 	    vm_map_splay_merge_succ(header, root, rlist));
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
 	    __func__, map, map->nentries, entry);
 }
 
 /*
  *	vm_map_lookup_entry:	[ internal use only ]
  *
  *	Finds the map entry containing (or
  *	immediately preceding) the specified address
  *	in the given map; the entry is returned
  *	in the "entry" parameter.  The boolean
  *	result indicates whether the address is
  *	actually contained in the map.
  */
 boolean_t
 vm_map_lookup_entry(
 	vm_map_t map,
 	vm_offset_t address,
 	vm_map_entry_t *entry)	/* OUT */
 {
 	vm_map_entry_t cur, header, lbound, ubound;
 	boolean_t locked;
 
 	/*
 	 * If the map is empty, then the map entry immediately preceding
 	 * "address" is the map's header.
 	 */
 	header = &map->header;
 	cur = map->root;
 	if (cur == NULL) {
 		*entry = header;
 		return (FALSE);
 	}
 	if (address >= cur->start && cur->end > address) {
 		*entry = cur;
 		return (TRUE);
 	}
 	if ((locked = vm_map_locked(map)) ||
 	    sx_try_upgrade(&map->lock)) {
 		/*
 		 * Splay requires a write lock on the map.  However, it only
 		 * restructures the binary search tree; it does not otherwise
 		 * change the map.  Thus, the map's timestamp need not change
 		 * on a temporary upgrade.
 		 */
 		cur = vm_map_splay(map, address);
 		if (!locked) {
 			VM_MAP_UNLOCK_CONSISTENT(map);
 			sx_downgrade(&map->lock);
 		}
 
 		/*
 		 * If "address" is contained within a map entry, the new root
 		 * is that map entry.  Otherwise, the new root is a map entry
 		 * immediately before or after "address".
 		 */
 		if (address < cur->start) {
 			*entry = header;
 			return (FALSE);
 		}
 		*entry = cur;
 		return (address < cur->end);
 	}
 	/*
 	 * Since the map is only locked for read access, perform a
 	 * standard binary search tree lookup for "address".
 	 */
 	lbound = ubound = header;
 	for (;;) {
 		if (address < cur->start) {
 			ubound = cur;
 			cur = cur->left;
 			if (cur == lbound)
 				break;
 		} else if (cur->end <= address) {
 			lbound = cur;
 			cur = cur->right;
 			if (cur == ubound)
 				break;
 		} else {
 			*entry = cur;
 			return (TRUE);
 		}
 	}
 	*entry = lbound;
 	return (FALSE);
 }
 
 /*
  *	vm_map_insert:
  *
  *	Inserts the given whole VM object into the target
  *	map at the specified address range.  The object's
  *	size should match that of the address range.
  *
  *	Requires that the map be locked, and leaves it so.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, next_entry, prev_entry;
 	struct ucred *cred;
 	vm_eflags_t protoeflags;
 	vm_inherit_t inheritance;
 	u_long bdry;
 	u_int bidx;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(object != kernel_object ||
 	    (cow & MAP_COPY_ON_WRITE) == 0,
 	    ("vm_map_insert: kernel object and COW"));
 	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
 	    (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
 	    ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
 	    object, cow));
 	KASSERT((prot & ~max) == 0,
 	    ("prot %#x is not subset of max_prot %#x", prot, max));
 
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
 	if (start == end || !vm_map_range_valid(map, start, end))
 		return (KERN_INVALID_ADDRESS);
 
 	/*
 	 * Find the entry prior to the proposed starting address; if it's part
 	 * of an existing entry, this range is bogus.
 	 */
 	if (vm_map_lookup_entry(map, start, &prev_entry))
 		return (KERN_NO_SPACE);
 
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
 	next_entry = vm_map_entry_succ(prev_entry);
 	if (next_entry->start < end)
 		return (KERN_NO_SPACE);
 
 	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
 	    max != VM_PROT_NONE))
 		return (KERN_INVALID_ARGUMENT);
 
 	protoeflags = 0;
 	if (cow & MAP_COPY_ON_WRITE)
 		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
 	if (cow & MAP_NOFAULT)
 		protoeflags |= MAP_ENTRY_NOFAULT;
 	if (cow & MAP_DISABLE_SYNCER)
 		protoeflags |= MAP_ENTRY_NOSYNC;
 	if (cow & MAP_DISABLE_COREDUMP)
 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
 	if (cow & MAP_STACK_GROWS_DOWN)
 		protoeflags |= MAP_ENTRY_GROWS_DOWN;
 	if (cow & MAP_STACK_GROWS_UP)
 		protoeflags |= MAP_ENTRY_GROWS_UP;
 	if (cow & MAP_WRITECOUNT)
 		protoeflags |= MAP_ENTRY_WRITECNT;
 	if (cow & MAP_VN_EXEC)
 		protoeflags |= MAP_ENTRY_VN_EXEC;
 	if ((cow & MAP_CREATE_GUARD) != 0)
 		protoeflags |= MAP_ENTRY_GUARD;
 	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
 		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
 	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
 		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
 	if (cow & MAP_INHERIT_SHARE)
 		inheritance = VM_INHERIT_SHARE;
 	else
 		inheritance = VM_INHERIT_DEFAULT;
 	if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
 		/* This magically ignores index 0, for usual page size. */
 		bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
 		    MAP_SPLIT_BOUNDARY_SHIFT;
 		if (bidx >= MAXPAGESIZES)
 			return (KERN_INVALID_ARGUMENT);
 		bdry = pagesizes[bidx] - 1;
 		if ((start & bdry) != 0 || (end & bdry) != 0)
 			return (KERN_INVALID_ARGUMENT);
 		protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
 	}
 
 	cred = NULL;
 	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
 		goto charged;
 	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
 	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
 		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
 			return (KERN_RESOURCE_SHORTAGE);
 		KASSERT(object == NULL ||
 		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
 		    object->cred == NULL,
 		    ("overcommit: vm_map_insert o %p", object));
 		cred = curthread->td_ucred;
 	}
 
 charged:
 	/* Expand the kernel pmap, if necessary. */
 	if (map == kernel_map && end > kernel_vm_end)
 		pmap_growkernel(end);
 	if (object != NULL) {
 		/*
 		 * OBJ_ONEMAPPING must be cleared unless this mapping
 		 * is trivially proven to be the only mapping for any
 		 * of the object's pages.  (Object granularity
 		 * reference counting is insufficient to recognize
 		 * aliases with precision.)
 		 */
 		if ((object->flags & OBJ_ANON) != 0) {
 			VM_OBJECT_WLOCK(object);
 			if (object->ref_count > 1 || object->shadow_count != 0)
 				vm_object_clear_flag(object, OBJ_ONEMAPPING);
 			VM_OBJECT_WUNLOCK(object);
 		}
 	} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
 	    protoeflags &&
 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
 	    MAP_VN_EXEC)) == 0 &&
 	    prev_entry->end == start && (prev_entry->cred == cred ||
 	    (prev_entry->object.vm_object != NULL &&
 	    prev_entry->object.vm_object->cred == cred)) &&
 	    vm_object_coalesce(prev_entry->object.vm_object,
 	    prev_entry->offset,
 	    (vm_size_t)(prev_entry->end - prev_entry->start),
 	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
 	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
 		/*
 		 * We were able to extend the object.  Determine if we
 		 * can extend the previous map entry to include the
 		 * new range as well.
 		 */
 		if (prev_entry->inheritance == inheritance &&
 		    prev_entry->protection == prot &&
 		    prev_entry->max_protection == max &&
 		    prev_entry->wired_count == 0) {
 			KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
 			    0, ("prev_entry %p has incoherent wiring",
 			    prev_entry));
 			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
 				map->size += end - prev_entry->end;
 			vm_map_entry_resize(map, prev_entry,
 			    end - prev_entry->end);
 			vm_map_try_merge_entries(map, prev_entry, next_entry);
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * If we can extend the object but cannot extend the
 		 * map entry, we have to create a new map entry.  We
 		 * must bump the ref count on the extended object to
 		 * account for it.  object may be NULL.
 		 */
 		object = prev_entry->object.vm_object;
 		offset = prev_entry->offset +
 		    (prev_entry->end - prev_entry->start);
 		vm_object_reference(object);
 		if (cred != NULL && object != NULL && object->cred != NULL &&
 		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			/* Object already accounts for this uid. */
 			cred = NULL;
 		}
 	}
 	if (cred != NULL)
 		crhold(cred);
 
 	/*
 	 * Create a new entry
 	 */
 	new_entry = vm_map_entry_create(map);
 	new_entry->start = start;
 	new_entry->end = end;
 	new_entry->cred = NULL;
 
 	new_entry->eflags = protoeflags;
 	new_entry->object.vm_object = object;
 	new_entry->offset = offset;
 
 	new_entry->inheritance = inheritance;
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
 	new_entry->wiring_thread = NULL;
 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
 	new_entry->next_read = start;
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
 	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
 	new_entry->cred = cred;
 
 	/*
 	 * Insert the new entry into the list
 	 */
 	vm_map_entry_link(map, new_entry);
 	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
 		map->size += new_entry->end - new_entry->start;
 
 	/*
 	 * Try to coalesce the new entry with both the previous and next
 	 * entries in the list.  Previously, we only attempted to coalesce
 	 * with the previous entry when object is NULL.  Here, we handle the
 	 * other cases, which are less common.
 	 */
 	vm_map_try_merge_entries(map, prev_entry, new_entry);
 	vm_map_try_merge_entries(map, new_entry, next_entry);
 
 	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
 		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
 		    end - start, cow & MAP_PREFAULT_PARTIAL);
 	}
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_findspace:
  *
  *	Find the first fit (lowest VM address) for "length" free bytes
  *	beginning at address >= start in the given map.
  *
  *	In a vm_map_entry, "max_free" is the maximum amount of
  *	contiguous free space between an entry in its subtree and a
  *	neighbor of that entry.  This allows finding a free region in
  *	one path down the tree, so O(log n) amortized with splay
  *	trees.
  *
  *	The map must be locked, and leaves it so.
  *
  *	Returns: starting address if sufficient space,
  *		 vm_map_max(map)-length+1 if insufficient space.
  */
 vm_offset_t
 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
 {
 	vm_map_entry_t header, llist, rlist, root, y;
 	vm_size_t left_length, max_free_left, max_free_right;
 	vm_offset_t gap_end;
 
 	/*
 	 * Request must fit within min/max VM address and must avoid
 	 * address wrap.
 	 */
 	start = MAX(start, vm_map_min(map));
 	if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
 		return (vm_map_max(map) - length + 1);
 
 	/* Empty tree means wide open address space. */
 	if (map->root == NULL)
 		return (start);
 
 	/*
 	 * After splay_split, if start is within an entry, push it to the start
 	 * of the following gap.  If rlist is at the end of the gap containing
 	 * start, save the end of that gap in gap_end to see if the gap is big
 	 * enough; otherwise set gap_end to start skip gap-checking and move
 	 * directly to a search of the right subtree.
 	 */
 	header = &map->header;
 	root = vm_map_splay_split(map, start, length, &llist, &rlist);
 	gap_end = rlist->start;
 	if (root != NULL) {
 		start = root->end;
 		if (root->right != rlist)
 			gap_end = start;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else if (rlist != header) {
 		root = rlist;
 		rlist = root->left;
 		max_free_left = vm_map_splay_merge_pred(header, root, llist);
 		max_free_right = vm_map_splay_merge_right(header, root, rlist);
 	} else {
 		root = llist;
 		llist = root->right;
 		max_free_left = vm_map_splay_merge_left(header, root, llist);
 		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
 	}
 	root->max_free = vm_size_max(max_free_left, max_free_right);
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	if (length <= gap_end - start)
 		return (start);
 
 	/* With max_free, can immediately tell if no solution. */
 	if (root->right == header || length > root->right->max_free)
 		return (vm_map_max(map) - length + 1);
 
 	/*
 	 * Splay for the least large-enough gap in the right subtree.
 	 */
 	llist = rlist = header;
 	for (left_length = 0;;
 	    left_length = vm_map_entry_max_free_left(root, llist)) {
 		if (length <= left_length)
 			SPLAY_LEFT_STEP(root, y, llist, rlist,
 			    length <= vm_map_entry_max_free_left(y, llist));
 		else
 			SPLAY_RIGHT_STEP(root, y, llist, rlist,
 			    length > vm_map_entry_max_free_left(y, root));
 		if (root == NULL)
 			break;
 	}
 	root = llist;
 	llist = root->right;
 	max_free_left = vm_map_splay_merge_left(header, root, llist);
 	if (rlist == header) {
 		root->max_free = vm_size_max(max_free_left,
 		    vm_map_splay_merge_succ(header, root, rlist));
 	} else {
 		y = rlist;
 		rlist = y->left;
 		y->max_free = vm_size_max(
 		    vm_map_splay_merge_pred(root, y, root),
 		    vm_map_splay_merge_right(header, y, rlist));
 		root->max_free = vm_size_max(max_free_left, y->max_free);
 	}
 	map->root = root;
 	VM_MAP_ASSERT_CONSISTENT(map);
 	return (root->end);
 }
 
 int
 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_size_t length, vm_prot_t prot,
     vm_prot_t max, int cow)
 {
 	vm_offset_t end;
 	int result;
 
 	end = start + length;
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_fixed: non-NULL backing object for stack"));
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if ((cow & MAP_CHECK_EXCL) == 0) {
 		result = vm_map_delete(map, start, end);
 		if (result != KERN_SUCCESS)
 			goto out;
 	}
 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 		result = vm_map_stack_locked(map, start, length, sgrowsiz,
 		    prot, max, cow);
 	} else {
 		result = vm_map_insert(map, object, offset, start, end,
 		    prot, max, cow);
 	}
 out:
 	vm_map_unlock(map);
 	return (result);
 }
 
 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
 
 static int cluster_anon = 1;
 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
     &cluster_anon, 0,
     "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
 
 static bool
 clustering_anon_allowed(vm_offset_t addr)
 {
 
 	switch (cluster_anon) {
 	case 0:
 		return (false);
 	case 1:
 		return (addr == 0);
 	case 2:
 	default:
 		return (true);
 	}
 }
 
 static long aslr_restarts;
 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
     &aslr_restarts, 0,
     "Number of aslr failures");
 
 /*
  * Searches for the specified amount of free space in the given map with the
  * specified alignment.  Performs an address-ordered, first-fit search from
  * the given address "*addr", with an optional upper bound "max_addr".  If the
  * parameter "alignment" is zero, then the alignment is computed from the
  * given (object, offset) pair so as to enable the greatest possible use of
  * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
  * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
  *
  * The map must be locked.  Initially, there must be at least "length" bytes
  * of free space at the given address.
  */
 static int
 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
     vm_offset_t alignment)
 {
 	vm_offset_t aligned_addr, free_addr;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	free_addr = *addr;
 	KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
 	    ("caller failed to provide space %#jx at address %p",
 	     (uintmax_t)length, (void *)free_addr));
 	for (;;) {
 		/*
 		 * At the start of every iteration, the free space at address
 		 * "*addr" is at least "length" bytes.
 		 */
 		if (alignment == 0)
 			pmap_align_superpage(object, offset, addr, length);
 		else if ((*addr & (alignment - 1)) != 0) {
 			*addr &= ~(alignment - 1);
 			*addr += alignment;
 		}
 		aligned_addr = *addr;
 		if (aligned_addr == free_addr) {
 			/*
 			 * Alignment did not change "*addr", so "*addr" must
 			 * still provide sufficient free space.
 			 */
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * Test for address wrap on "*addr".  A wrapped "*addr" could
 		 * be a valid address, in which case vm_map_findspace() cannot
 		 * be relied upon to fail.
 		 */
 		if (aligned_addr < free_addr)
 			return (KERN_NO_SPACE);
 		*addr = vm_map_findspace(map, aligned_addr, length);
 		if (*addr + length > vm_map_max(map) ||
 		    (max_addr != 0 && *addr + length > max_addr))
 			return (KERN_NO_SPACE);
 		free_addr = *addr;
 		if (free_addr == aligned_addr) {
 			/*
 			 * If a successful call to vm_map_findspace() did not
 			 * change "*addr", then "*addr" must still be aligned
 			 * and provide sufficient free space.
 			 */
 			return (KERN_SUCCESS);
 		}
 	}
 }
 
 int
 vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
     vm_offset_t max_addr, vm_offset_t alignment)
 {
 	/* XXXKIB ASLR eh ? */
 	*addr = vm_map_findspace(map, *addr, length);
 	if (*addr + length > vm_map_max(map) ||
 	    (max_addr != 0 && *addr + length > max_addr))
 		return (KERN_NO_SPACE);
 	return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
 	    alignment));
 }
 
 /*
  *	vm_map_find finds an unallocated region in the target address
  *	map with the given length.  The search is defined to be
  *	first-fit from the specified address; the region found is
  *	returned in the same parameter.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	    vm_offset_t *addr,	/* IN/OUT */
 	    vm_size_t length, vm_offset_t max_addr, int find_space,
 	    vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_offset_t alignment, curr_min_addr, min_addr;
 	int gap, pidx, rv, try;
 	bool cluster, en_aslr, update_anon;
 
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_find: non-NULL backing object for stack"));
 	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
 	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
 	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
 	    (object->flags & OBJ_COLORED) == 0))
 		find_space = VMFS_ANY_SPACE;
 	if (find_space >> 8 != 0) {
 		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
 		alignment = (vm_offset_t)1 << (find_space >> 8);
 	} else
 		alignment = 0;
 	en_aslr = (map->flags & MAP_ASLR) != 0;
 	update_anon = cluster = clustering_anon_allowed(*addr) &&
 	    (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
 	    find_space != VMFS_NO_SPACE && object == NULL &&
 	    (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
 	    MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
 	curr_min_addr = min_addr = *addr;
 	if (en_aslr && min_addr == 0 && !cluster &&
 	    find_space != VMFS_NO_SPACE &&
 	    (map->flags & MAP_ASLR_IGNSTART) != 0)
 		curr_min_addr = min_addr = vm_map_min(map);
 	try = 0;
 	vm_map_lock(map);
 	if (cluster) {
 		curr_min_addr = map->anon_loc;
 		if (curr_min_addr == 0)
 			cluster = false;
 	}
 	if (find_space != VMFS_NO_SPACE) {
 		KASSERT(find_space == VMFS_ANY_SPACE ||
 		    find_space == VMFS_OPTIMAL_SPACE ||
 		    find_space == VMFS_SUPER_SPACE ||
 		    alignment != 0, ("unexpected VMFS flag"));
 again:
 		/*
 		 * When creating an anonymous mapping, try clustering
 		 * with an existing anonymous mapping first.
 		 *
 		 * We make up to two attempts to find address space
 		 * for a given find_space value. The first attempt may
 		 * apply randomization or may cluster with an existing
 		 * anonymous mapping. If this first attempt fails,
 		 * perform a first-fit search of the available address
 		 * space.
 		 *
 		 * If all tries failed, and find_space is
 		 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
 		 * Again enable clustering and randomization.
 		 */
 		try++;
 		MPASS(try <= 2);
 
 		if (try == 2) {
 			/*
 			 * Second try: we failed either to find a
 			 * suitable region for randomizing the
 			 * allocation, or to cluster with an existing
 			 * mapping.  Retry with free run.
 			 */
 			curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
 			    vm_map_min(map) : min_addr;
 			atomic_add_long(&aslr_restarts, 1);
 		}
 
 		if (try == 1 && en_aslr && !cluster) {
 			/*
 			 * Find space for allocation, including
 			 * gap needed for later randomization.
 			 */
 			pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
 			    (find_space == VMFS_SUPER_SPACE || find_space ==
 			    VMFS_OPTIMAL_SPACE) ? 1 : 0;
 			gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
 			    (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
 			    aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
 			*addr = vm_map_findspace(map, curr_min_addr,
 			    length + gap * pagesizes[pidx]);
 			if (*addr + length + gap * pagesizes[pidx] >
 			    vm_map_max(map))
 				goto again;
 			/* And randomize the start address. */
 			*addr += (arc4random() % gap) * pagesizes[pidx];
 			if (max_addr != 0 && *addr + length > max_addr)
 				goto again;
 		} else {
 			*addr = vm_map_findspace(map, curr_min_addr, length);
 			if (*addr + length > vm_map_max(map) ||
 			    (max_addr != 0 && *addr + length > max_addr)) {
 				if (cluster) {
 					cluster = false;
 					MPASS(try == 1);
 					goto again;
 				}
 				rv = KERN_NO_SPACE;
 				goto done;
 			}
 		}
 
 		if (find_space != VMFS_ANY_SPACE &&
 		    (rv = vm_map_alignspace(map, object, offset, addr, length,
 		    max_addr, alignment)) != KERN_SUCCESS) {
 			if (find_space == VMFS_OPTIMAL_SPACE) {
 				find_space = VMFS_ANY_SPACE;
 				curr_min_addr = min_addr;
 				cluster = update_anon;
 				try = 0;
 				goto again;
 			}
 			goto done;
 		}
 	} else if ((cow & MAP_REMAP) != 0) {
 		if (!vm_map_range_valid(map, *addr, *addr + length)) {
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		rv = vm_map_delete(map, *addr, *addr + length);
 		if (rv != KERN_SUCCESS)
 			goto done;
 	}
 	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 		rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
 		    max, cow);
 	} else {
 		rv = vm_map_insert(map, object, offset, *addr, *addr + length,
 		    prot, max, cow);
 	}
 	if (rv == KERN_SUCCESS && update_anon)
 		map->anon_loc = *addr + length;
 done:
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_find_min() is a variant of vm_map_find() that takes an
  *	additional parameter (min_addr) and treats the given address
  *	(*addr) differently.  Specifically, it treats *addr as a hint
  *	and not as the minimum address where the mapping is created.
  *
  *	This function works in two phases.  First, it tries to
  *	allocate above the hint.  If that fails and the hint is
  *	greater than min_addr, it performs a second pass, replacing
  *	the hint with min_addr as the minimum address for the
  *	allocation.
  */
 int
 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
     int cow)
 {
 	vm_offset_t hint;
 	int rv;
 
 	hint = *addr;
 	for (;;) {
 		rv = vm_map_find(map, object, offset, addr, length, max_addr,
 		    find_space, prot, max, cow);
 		if (rv == KERN_SUCCESS || min_addr >= hint)
 			return (rv);
 		*addr = hint = min_addr;
 	}
 }
 
 /*
  * A map entry with any of the following flags set must not be merged with
  * another entry.
  */
 #define	MAP_ENTRY_NOMERGE_MASK	(MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
 	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC)
 
 static bool
 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
 {
 
 	KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
 	    (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
 	    ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
 	    prev, entry));
 	return (prev->end == entry->start &&
 	    prev->object.vm_object == entry->object.vm_object &&
 	    (prev->object.vm_object == NULL ||
 	    prev->offset + (prev->end - prev->start) == entry->offset) &&
 	    prev->eflags == entry->eflags &&
 	    prev->protection == entry->protection &&
 	    prev->max_protection == entry->max_protection &&
 	    prev->inheritance == entry->inheritance &&
 	    prev->wired_count == entry->wired_count &&
 	    prev->cred == entry->cred);
 }
 
 static void
 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 
 	/*
 	 * If the backing object is a vnode object, vm_object_deallocate()
 	 * calls vrele().  However, vrele() does not lock the vnode because
 	 * the vnode has additional references.  Thus, the map lock can be
 	 * kept without causing a lock-order reversal with the vnode lock.
 	 *
 	 * Since we count the number of virtual page mappings in
 	 * object->un_pager.vnp.writemappings, the writemappings value
 	 * should not be adjusted when the entry is disposed of.
 	 */
 	if (entry->object.vm_object != NULL)
 		vm_object_deallocate(entry->object.vm_object);
 	if (entry->cred != NULL)
 		crfree(entry->cred);
 	vm_map_entry_dispose(map, entry);
 }
 
 /*
  *	vm_map_try_merge_entries:
  *
  *	Compare the given map entry to its predecessor, and merge its precessor
  *	into it if possible.  The entry remains valid, and may be extended.
  *	The predecessor may be deleted.
  *
  *	The map must be locked.
  */
 void
 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
     vm_map_entry_t entry)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
 	    vm_map_mergeable_neighbors(prev_entry, entry)) {
 		vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
 		vm_map_merged_neighbor_dispose(map, prev_entry);
 	}
 }
 
 /*
  *	vm_map_entry_back:
  *
  *	Allocate an object to back a map entry.
  */
 static inline void
 vm_map_entry_back(vm_map_entry_t entry)
 {
 	vm_object_t object;
 
 	KASSERT(entry->object.vm_object == NULL,
 	    ("map entry %p has backing object", entry));
 	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 	    ("map entry %p is a submap", entry));
 	object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
 	    entry->cred, entry->end - entry->start);
 	entry->object.vm_object = object;
 	entry->offset = 0;
 	entry->cred = NULL;
 }
 
 /*
  *	vm_map_entry_charge_object
  *
  *	If there is no object backing this entry, create one.  Otherwise, if
  *	the entry has cred, give it to the backing object.
  */
 static inline void
 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 	    ("map entry %p is a submap", entry));
 	if (entry->object.vm_object == NULL && !map->system_map &&
 	    (entry->eflags & MAP_ENTRY_GUARD) == 0)
 		vm_map_entry_back(entry);
 	else if (entry->object.vm_object != NULL &&
 	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 	    entry->cred != NULL) {
 		VM_OBJECT_WLOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
 		VM_OBJECT_WUNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 }
 
 /*
  *	vm_map_entry_clone
  *
  *	Create a duplicate map entry for clipping.
  */
 static vm_map_entry_t
 vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	/*
 	 * Create a backing object now, if none exists, so that more individual
 	 * objects won't be created after the map entry is split.
 	 */
 	vm_map_entry_charge_object(map, entry);
 
 	/* Clone the entry. */
 	new_entry = vm_map_entry_create(map);
 	*new_entry = *entry;
 	if (new_entry->cred != NULL)
 		crhold(entry->cred);
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		vm_object_reference(new_entry->object.vm_object);
 		vm_map_entry_set_vnode_text(new_entry, true);
 		/*
 		 * The object->un_pager.vnp.writemappings for the object of
 		 * MAP_ENTRY_WRITECNT type entry shall be kept as is here.  The
 		 * virtual pages are re-distributed among the clipped entries,
 		 * so the sum is left the same.
 		 */
 	}
 	return (new_entry);
 }
 
 /*
  *	vm_map_clip_start:	[ internal use only ]
  *
  *	Asserts that the given entry begins at or after
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 static int
 vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
 {
 	vm_map_entry_t new_entry;
 	int bdry_idx;
 
 	if (!map->system_map)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "%s: map %p entry %p start 0x%jx", __func__, map, entry,
 		    (uintmax_t)startaddr);
 
 	if (startaddr <= entry->start)
 		return (KERN_SUCCESS);
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->end > startaddr && entry->start < startaddr,
 	    ("%s: invalid clip of entry %p", __func__, entry));
 
 	bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
 	    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
 	if (bdry_idx != 0) {
 		if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
 			return (KERN_INVALID_ARGUMENT);
 	}
 
 	new_entry = vm_map_entry_clone(map, entry);
 
 	/*
 	 * Split off the front portion.  Insert the new entry BEFORE this one,
 	 * so that this entry has the specified starting address.
 	 */
 	new_entry->end = startaddr;
 	vm_map_entry_link(map, new_entry);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_clip_start:
  *
  *	Find the entry at or just after 'start', and clip it if 'start' is in
  *	the interior of the entry.  Return entry after 'start', and in
  *	prev_entry set the entry before 'start'.
  */
 static int
 vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
     vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
 {
 	vm_map_entry_t entry;
 	int rv;
 
 	if (!map->system_map)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "%s: map %p start 0x%jx prev %p", __func__, map,
 		    (uintmax_t)start, prev_entry);
 
 	if (vm_map_lookup_entry(map, start, prev_entry)) {
 		entry = *prev_entry;
 		rv = vm_map_clip_start(map, entry, start);
 		if (rv != KERN_SUCCESS)
 			return (rv);
 		*prev_entry = vm_map_entry_pred(entry);
 	} else
 		entry = vm_map_entry_succ(*prev_entry);
 	*res_entry = entry;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_clip_end:	[ internal use only ]
  *
  *	Asserts that the given entry ends at or before
  *	the specified address; if necessary,
  *	it splits the entry into two.
  */
 static int
 vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
 {
 	vm_map_entry_t new_entry;
 	int bdry_idx;
 
 	if (!map->system_map)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "%s: map %p entry %p end 0x%jx", __func__, map, entry,
 		    (uintmax_t)endaddr);
 
 	if (endaddr >= entry->end)
 		return (KERN_SUCCESS);
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->start < endaddr && entry->end > endaddr,
 	    ("%s: invalid clip of entry %p", __func__, entry));
 
 	bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
 	    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
 	if (bdry_idx != 0) {
 		if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
 			return (KERN_INVALID_ARGUMENT);
 	}
 
 	new_entry = vm_map_entry_clone(map, entry);
 
 	/*
 	 * Split off the back portion.  Insert the new entry AFTER this one,
 	 * so that this entry has the specified ending address.
 	 */
 	new_entry->start = endaddr;
 	vm_map_entry_link(map, new_entry);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_submap:		[ kernel use only ]
  *
  *	Mark the given range as handled by a subordinate map.
  *
  *	This range must have been created with vm_map_find,
  *	and no other operations may have been performed on this
  *	range prior to calling vm_map_submap.
  *
  *	Only a limited number of operations can be performed
  *	within this rage after calling vm_map_submap:
  *		vm_fault
  *	[Don't try vm_map_copy!]
  *
  *	To remove a submapping, one must first remove the
  *	range from the superior map, and then destroy the
  *	submap (if desired).  [Better yet, don't try it.]
  */
 int
 vm_map_submap(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	vm_map_t submap)
 {
 	vm_map_entry_t entry;
 	int result;
 
 	result = KERN_INVALID_ARGUMENT;
 
 	vm_map_lock(submap);
 	submap->flags |= MAP_IS_SUB_MAP;
 	vm_map_unlock(submap);
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
 	    (entry->eflags & MAP_ENTRY_COW) == 0 &&
 	    entry->object.vm_object == NULL) {
 		result = vm_map_clip_start(map, entry, start);
 		if (result != KERN_SUCCESS)
 			goto unlock;
 		result = vm_map_clip_end(map, entry, end);
 		if (result != KERN_SUCCESS)
 			goto unlock;
 		entry->object.sub_map = submap;
 		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
 		result = KERN_SUCCESS;
 	}
 unlock:
 	vm_map_unlock(map);
 
 	if (result != KERN_SUCCESS) {
 		vm_map_lock(submap);
 		submap->flags &= ~MAP_IS_SUB_MAP;
 		vm_map_unlock(submap);
 	}
 	return (result);
 }
 
 /*
  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
  */
 #define	MAX_INIT_PT	96
 
 /*
  *	vm_map_pmap_enter:
  *
  *	Preload the specified map's pmap with mappings to the specified
  *	object's memory-resident pages.  No further physical pages are
  *	allocated, and no further virtual pages are retrieved from secondary
  *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
  *	limited number of page mappings are created at the low-end of the
  *	specified address range.  (For this purpose, a superpage mapping
  *	counts as one page mapping.)  Otherwise, all resident pages within
  *	the specified address range are mapped.
  */
 static void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
 	vm_offset_t start;
 	vm_page_t p, p_start;
 	vm_pindex_t mask, psize, threshold, tmpidx;
 
 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
 		return;
 	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
 		VM_OBJECT_WLOCK(object);
 		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
 			pmap_object_init_pt(map->pmap, addr, object, pindex,
 			    size);
 			VM_OBJECT_WUNLOCK(object);
 			return;
 		}
 		VM_OBJECT_LOCK_DOWNGRADE(object);
 	} else
 		VM_OBJECT_RLOCK(object);
 
 	psize = atop(size);
 	if (psize + pindex > object->size) {
 		if (pindex >= object->size) {
 			VM_OBJECT_RUNLOCK(object);
 			return;
 		}
 		psize = object->size - pindex;
 	}
 
 	start = 0;
 	p_start = NULL;
 	threshold = MAX_INIT_PT;
 
 	p = vm_page_find_least(object, pindex);
 	/*
 	 * Assert: the variable p is either (1) the page with the
 	 * least pindex greater than or equal to the parameter pindex
 	 * or (2) NULL.
 	 */
 	for (;
 	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
 	     p = TAILQ_NEXT(p, listq)) {
 		/*
 		 * don't allow an madvise to blow away our really
 		 * free pages allocating pv entries.
 		 */
 		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
 		    vm_page_count_severe()) ||
 		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
 		    tmpidx >= threshold)) {
 			psize = tmpidx;
 			break;
 		}
 		if (vm_page_all_valid(p)) {
 			if (p_start == NULL) {
 				start = addr + ptoa(tmpidx);
 				p_start = p;
 			}
 			/* Jump ahead if a superpage mapping is possible. */
 			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
 			    (pagesizes[p->psind] - 1)) == 0) {
 				mask = atop(pagesizes[p->psind]) - 1;
 				if (tmpidx + mask < psize &&
 				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
 					p += mask;
 					threshold += mask;
 				}
 			}
 		} else if (p_start != NULL) {
 			pmap_enter_object(map->pmap, start, addr +
 			    ptoa(tmpidx), p_start, prot);
 			p_start = NULL;
 		}
 	}
 	if (p_start != NULL)
 		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
 		    p_start, prot);
 	VM_OBJECT_RUNLOCK(object);
 }
 
 /*
  *	vm_map_protect:
  *
  *	Sets the protection of the specified address
  *	region in the target map.  If "set_max" is
  *	specified, the maximum protection is to be set;
  *	otherwise, only the current protection is affected.
  */
 int
 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_prot_t new_prot, boolean_t set_max)
 {
 	vm_map_entry_t entry, first_entry, in_tran, prev_entry;
 	vm_object_t obj;
 	struct ucred *cred;
 	vm_prot_t old_prot;
 	int rv;
 
 	if (start == end)
 		return (KERN_SUCCESS);
 
 again:
 	in_tran = NULL;
 	vm_map_lock(map);
 
 	/*
 	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
 	 * need to fault pages into the map and will drop the map lock while
 	 * doing so, and the VM object may end up in an inconsistent state if we
 	 * update the protection on the map entry in between faults.
 	 */
 	vm_map_wait_busy(map);
 
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (!vm_map_lookup_entry(map, start, &first_entry))
 		first_entry = vm_map_entry_succ(first_entry);
 
 	/*
 	 * Make a first pass to check for protection violations.
 	 */
 	for (entry = first_entry; entry->start < end;
 	    entry = vm_map_entry_succ(entry)) {
 		if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if ((new_prot & entry->max_protection) != new_prot) {
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
 			in_tran = entry;
 	}
 
 	/*
 	 * Postpone the operation until all in-transition map entries have
 	 * stabilized.  An in-transition entry might already have its pages
 	 * wired and wired_count incremented, but not yet have its
 	 * MAP_ENTRY_USER_WIRED flag set.  In which case, we would fail to call
 	 * vm_fault_copy_entry() in the final loop below.
 	 */
 	if (in_tran != NULL) {
 		in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 		vm_map_unlock_and_wait(map, 0);
 		goto again;
 	}
 
 	/*
 	 * Before changing the protections, try to reserve swap space for any
 	 * private (i.e., copy-on-write) mappings that are transitioning from
 	 * read-only to read/write access.  If a reservation fails, break out
 	 * of this loop early and let the next loop simplify the entries, since
 	 * some may now be mergeable.
 	 */
 	rv = vm_map_clip_start(map, first_entry, start);
 	if (rv != KERN_SUCCESS) {
 		vm_map_unlock(map);
 		return (rv);
 	}
 	for (entry = first_entry; entry->start < end;
 	    entry = vm_map_entry_succ(entry)) {
 		rv = vm_map_clip_end(map, entry, end);
 		if (rv != KERN_SUCCESS) {
 			vm_map_unlock(map);
 			return (rv);
 		}
 
 		if (set_max ||
 		    ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
 		    ENTRY_CHARGED(entry) ||
 		    (entry->eflags & MAP_ENTRY_GUARD) != 0) {
 			continue;
 		}
 
 		cred = curthread->td_ucred;
 		obj = entry->object.vm_object;
 
 		if (obj == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
 			if (!swap_reserve(entry->end - entry->start)) {
 				rv = KERN_RESOURCE_SHORTAGE;
 				end = entry->end;
 				break;
 			}
 			crhold(cred);
 			entry->cred = cred;
 			continue;
 		}
 
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP)
 			continue;
 		VM_OBJECT_WLOCK(obj);
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
 			VM_OBJECT_WUNLOCK(obj);
 			continue;
 		}
 
 		/*
 		 * Charge for the whole object allocation now, since
 		 * we cannot distinguish between non-charged and
 		 * charged clipped mapping of the same object later.
 		 */
 		KASSERT(obj->charge == 0,
 		    ("vm_map_protect: object %p overcharged (entry %p)",
 		    obj, entry));
 		if (!swap_reserve(ptoa(obj->size))) {
 			VM_OBJECT_WUNLOCK(obj);
 			rv = KERN_RESOURCE_SHORTAGE;
 			end = entry->end;
 			break;
 		}
 
 		crhold(cred);
 		obj->cred = cred;
 		obj->charge = ptoa(obj->size);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 
 	/*
 	 * If enough swap space was available, go back and fix up protections.
 	 * Otherwise, just simplify entries, since some may have been modified.
 	 * [Note that clipping is not necessary the second time.]
 	 */
 	for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
 	    entry->start < end;
 	    vm_map_try_merge_entries(map, prev_entry, entry),
 	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 		if (rv != KERN_SUCCESS ||
 		    (entry->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 
 		old_prot = entry->protection;
 
 		if (set_max)
 			entry->protection =
 			    (entry->max_protection = new_prot) &
 			    old_prot;
 		else
 			entry->protection = new_prot;
 
 		/*
 		 * For user wired map entries, the normal lazy evaluation of
 		 * write access upgrades through soft page faults is
 		 * undesirable.  Instead, immediately copy any pages that are
 		 * copy-on-write and enable write access in the physical map.
 		 */
 		if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
 		    (entry->protection & VM_PROT_WRITE) != 0 &&
 		    (old_prot & VM_PROT_WRITE) == 0)
 			vm_fault_copy_entry(map, map, entry, entry, NULL);
 
 		/*
 		 * When restricting access, update the physical map.  Worry
 		 * about copy-on-write here.
 		 */
 		if ((old_prot & ~entry->protection) != 0) {
 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
 							VM_PROT_ALL)
 			pmap_protect(map->pmap, entry->start,
 			    entry->end,
 			    entry->protection & MASK(entry));
 #undef	MASK
 		}
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_madvise:
  *
  *	This routine traverses a processes map handling the madvise
  *	system call.  Advisories are classified as either those effecting
  *	the vm_map_entry structure, or those effecting the underlying
  *	objects.
  */
 int
 vm_map_madvise(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	int behav)
 {
 	vm_map_entry_t entry, prev_entry;
 	int rv;
 	bool modify_map;
 
 	/*
 	 * Some madvise calls directly modify the vm_map_entry, in which case
 	 * we need to use an exclusive lock on the map and we need to perform
 	 * various clipping operations.  Otherwise we only need a read-lock
 	 * on the map.
 	 */
 	switch(behav) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
 	case MADV_NOSYNC:
 	case MADV_AUTOSYNC:
 	case MADV_NOCORE:
 	case MADV_CORE:
 		if (start == end)
 			return (0);
 		modify_map = true;
 		vm_map_lock(map);
 		break;
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
 		if (start == end)
 			return (0);
 		modify_map = false;
 		vm_map_lock_read(map);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/*
 	 * Locate starting entry and clip if necessary.
 	 */
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (modify_map) {
 		/*
 		 * madvise behaviors that are implemented in the vm_map_entry.
 		 *
 		 * We clip the vm_map_entry so that behavioral changes are
 		 * limited to the specified address range.
 		 */
 		rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
 		if (rv != KERN_SUCCESS) {
 			vm_map_unlock(map);
 			return (vm_mmap_to_errno(rv));
 		}
 
 		for (; entry->start < end; prev_entry = entry,
 		    entry = vm_map_entry_succ(entry)) {
 			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 				continue;
 
 			rv = vm_map_clip_end(map, entry, end);
 			if (rv != KERN_SUCCESS) {
 				vm_map_unlock(map);
 				return (vm_mmap_to_errno(rv));
 			}
 
 			switch (behav) {
 			case MADV_NORMAL:
 				vm_map_entry_set_behavior(entry,
 				    MAP_ENTRY_BEHAV_NORMAL);
 				break;
 			case MADV_SEQUENTIAL:
 				vm_map_entry_set_behavior(entry,
 				    MAP_ENTRY_BEHAV_SEQUENTIAL);
 				break;
 			case MADV_RANDOM:
 				vm_map_entry_set_behavior(entry,
 				    MAP_ENTRY_BEHAV_RANDOM);
 				break;
 			case MADV_NOSYNC:
 				entry->eflags |= MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_AUTOSYNC:
 				entry->eflags &= ~MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_NOCORE:
 				entry->eflags |= MAP_ENTRY_NOCOREDUMP;
 				break;
 			case MADV_CORE:
 				entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
 				break;
 			default:
 				break;
 			}
 			vm_map_try_merge_entries(map, prev_entry, entry);
 		}
 		vm_map_try_merge_entries(map, prev_entry, entry);
 		vm_map_unlock(map);
 	} else {
 		vm_pindex_t pstart, pend;
 
 		/*
 		 * madvise behaviors that are implemented in the underlying
 		 * vm_object.
 		 *
 		 * Since we don't clip the vm_map_entry, we have to clip
 		 * the vm_object pindex and count.
 		 */
 		if (!vm_map_lookup_entry(map, start, &entry))
 			entry = vm_map_entry_succ(entry);
 		for (; entry->start < end;
 		    entry = vm_map_entry_succ(entry)) {
 			vm_offset_t useEnd, useStart;
 
 			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 				continue;
 
 			/*
 			 * MADV_FREE would otherwise rewind time to
 			 * the creation of the shadow object.  Because
 			 * we hold the VM map read-locked, neither the
 			 * entry's object nor the presence of a
 			 * backing object can change.
 			 */
 			if (behav == MADV_FREE &&
 			    entry->object.vm_object != NULL &&
 			    entry->object.vm_object->backing_object != NULL)
 				continue;
 
 			pstart = OFF_TO_IDX(entry->offset);
 			pend = pstart + atop(entry->end - entry->start);
 			useStart = entry->start;
 			useEnd = entry->end;
 
 			if (entry->start < start) {
 				pstart += atop(start - entry->start);
 				useStart = start;
 			}
 			if (entry->end > end) {
 				pend -= atop(entry->end - end);
 				useEnd = end;
 			}
 
 			if (pstart >= pend)
 				continue;
 
 			/*
 			 * Perform the pmap_advise() before clearing
 			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
 			 * concurrent pmap operation, such as pmap_remove(),
 			 * could clear a reference in the pmap and set
 			 * PGA_REFERENCED on the page before the pmap_advise()
 			 * had completed.  Consequently, the page would appear
 			 * referenced based upon an old reference that
 			 * occurred before this pmap_advise() ran.
 			 */
 			if (behav == MADV_DONTNEED || behav == MADV_FREE)
 				pmap_advise(map->pmap, useStart, useEnd,
 				    behav);
 
 			vm_object_madvise(entry->object.vm_object, pstart,
 			    pend, behav);
 
 			/*
 			 * Pre-populate paging structures in the
 			 * WILLNEED case.  For wired entries, the
 			 * paging structures are already populated.
 			 */
 			if (behav == MADV_WILLNEED &&
 			    entry->wired_count == 0) {
 				vm_map_pmap_enter(map,
 				    useStart,
 				    entry->protection,
 				    entry->object.vm_object,
 				    pstart,
 				    ptoa(pend - pstart),
 				    MAP_PREFAULT_MADVISE
 				);
 			}
 		}
 		vm_map_unlock_read(map);
 	}
 	return (0);
 }
 
 /*
  *	vm_map_inherit:
  *
  *	Sets the inheritance of the specified address
  *	range in the target map.  Inheritance
  *	affects how the map will be shared with
  *	child maps at the time of vmspace_fork.
  */
 int
 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_inherit_t new_inheritance)
 {
 	vm_map_entry_t entry, lentry, prev_entry, start_entry;
 	int rv;
 
 	switch (new_inheritance) {
 	case VM_INHERIT_NONE:
 	case VM_INHERIT_COPY:
 	case VM_INHERIT_SHARE:
 	case VM_INHERIT_ZERO:
 		break;
 	default:
 		return (KERN_INVALID_ARGUMENT);
 	}
 	if (start == end)
 		return (KERN_SUCCESS);
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
 	if (rv != KERN_SUCCESS)
 		goto unlock;
 	if (vm_map_lookup_entry(map, end - 1, &lentry)) {
 		rv = vm_map_clip_end(map, lentry, end);
 		if (rv != KERN_SUCCESS)
 			goto unlock;
 	}
 	if (new_inheritance == VM_INHERIT_COPY) {
 		for (entry = start_entry; entry->start < end;
 		    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 			if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
 			    != 0) {
 				rv = KERN_INVALID_ARGUMENT;
 				goto unlock;
 			}
 		}
 	}
 	for (entry = start_entry; entry->start < end; prev_entry = entry,
 	    entry = vm_map_entry_succ(entry)) {
 		KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
 		    entry, (uintmax_t)entry->end, (uintmax_t)end));
 		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
 		    new_inheritance != VM_INHERIT_ZERO)
 			entry->inheritance = new_inheritance;
 		vm_map_try_merge_entries(map, prev_entry, entry);
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 unlock:
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_entry_in_transition:
  *
  *	Release the map lock, and sleep until the entry is no longer in
  *	transition.  Awake and acquire the map lock.  If the map changed while
  *	another held the lock, lookup a possibly-changed entry at or after the
  *	'start' position of the old entry.
  */
 static vm_map_entry_t
 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
     vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
 {
 	vm_map_entry_t entry;
 	vm_offset_t start;
 	u_int last_timestamp;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 	    ("not in-tranition map entry %p", in_entry));
 	/*
 	 * We have not yet clipped the entry.
 	 */
 	start = MAX(in_start, in_entry->start);
 	in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 	last_timestamp = map->timestamp;
 	if (vm_map_unlock_and_wait(map, 0)) {
 		/*
 		 * Allow interruption of user wiring/unwiring?
 		 */
 	}
 	vm_map_lock(map);
 	if (last_timestamp + 1 == map->timestamp)
 		return (in_entry);
 
 	/*
 	 * Look again for the entry because the map was modified while it was
 	 * unlocked.  Specifically, the entry may have been clipped, merged, or
 	 * deleted.
 	 */
 	if (!vm_map_lookup_entry(map, start, &entry)) {
 		if (!holes_ok) {
 			*io_end = start;
 			return (NULL);
 		}
 		entry = vm_map_entry_succ(entry);
 	}
 	return (entry);
 }
 
 /*
  *	vm_map_unwire:
  *
  *	Implements both kernel and user unwiring.
  */
 int
 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	vm_map_entry_t entry, first_entry, next_entry, prev_entry;
 	int rv;
 	bool holes_ok, need_wakeup, user_unwire;
 
 	if (start == end)
 		return (KERN_SUCCESS);
 	holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
 	user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (holes_ok)
 			first_entry = vm_map_entry_succ(first_entry);
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	rv = KERN_SUCCESS;
 	for (entry = first_entry; entry->start < end; entry = next_entry) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			next_entry = vm_map_entry_in_transition(map, start,
 			    &end, holes_ok, entry);
 			if (next_entry == NULL) {
 				if (entry == first_entry) {
 					vm_map_unlock(map);
 					return (KERN_INVALID_ADDRESS);
 				}
 				rv = KERN_INVALID_ADDRESS;
 				break;
 			}
 			first_entry = (entry == first_entry) ?
 			    next_entry : NULL;
 			continue;
 		}
 		rv = vm_map_clip_start(map, entry, start);
 		if (rv != KERN_SUCCESS)
 			break;
 		rv = vm_map_clip_end(map, entry, end);
 		if (rv != KERN_SUCCESS)
 			break;
 
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
 		    entry->wiring_thread == NULL,
 		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = curthread;
 		next_entry = vm_map_entry_succ(entry);
 		/*
 		 * Check the map for holes in the specified region.
 		 * If holes_ok, skip this check.
 		 */
 		if (!holes_ok &&
 		    entry->end < end && next_entry->start > entry->end) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			break;
 		}
 		/*
 		 * If system unwiring, require that the entry is system wired.
 		 */
 		if (!user_unwire &&
 		    vm_map_entry_system_wired_count(entry) == 0) {
 			end = entry->end;
 			rv = KERN_INVALID_ARGUMENT;
 			break;
 		}
 	}
 	need_wakeup = false;
 	if (first_entry == NULL &&
 	    !vm_map_lookup_entry(map, start, &first_entry)) {
 		KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
 		prev_entry = first_entry;
 		entry = vm_map_entry_succ(first_entry);
 	} else {
 		prev_entry = vm_map_entry_pred(first_entry);
 		entry = first_entry;
 	}
 	for (; entry->start < end;
 	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 		/*
 		 * If holes_ok was specified, an empty
 		 * space in the unwired region could have been mapped
 		 * while the map lock was dropped for draining
 		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
 		 * could be simultaneously wiring this new mapping
 		 * entry.  Detect these cases and skip any entries
 		 * marked as in transition by us.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
 		    entry->wiring_thread != curthread) {
 			KASSERT(holes_ok,
 			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
 			continue;
 		}
 
 		if (rv == KERN_SUCCESS && (!user_unwire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
 			if (entry->wired_count == 1)
 				vm_map_entry_unwire(map, entry);
 			else
 				entry->wired_count--;
 			if (user_unwire)
 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 		}
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 		    ("vm_map_unwire: in-transition flag missing %p", entry));
 		KASSERT(entry->wiring_thread == curthread,
 		    ("vm_map_unwire: alien wire %p", entry));
 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = true;
 		}
 		vm_map_try_merge_entries(map, prev_entry, entry);
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 	vm_map_unlock(map);
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 static void
 vm_map_wire_user_count_sub(u_long npages)
 {
 
 	atomic_subtract_long(&vm_user_wire_count, npages);
 }
 
 static bool
 vm_map_wire_user_count_add(u_long npages)
 {
 	u_long wired;
 
 	wired = vm_user_wire_count;
 	do {
 		if (npages + wired > vm_page_max_user_wired)
 			return (false);
 	} while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
 	    npages + wired));
 
 	return (true);
 }
 
 /*
  *	vm_map_wire_entry_failure:
  *
  *	Handle a wiring failure on the given entry.
  *
  *	The map should be locked.
  */
 static void
 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
     vm_offset_t failed_addr)
 {
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
 	    entry->wired_count == 1,
 	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
 	KASSERT(failed_addr < entry->end,
 	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
 
 	/*
 	 * If any pages at the start of this entry were successfully wired,
 	 * then unwire them.
 	 */
 	if (failed_addr > entry->start) {
 		pmap_unwire(map->pmap, entry->start, failed_addr);
 		vm_object_unwire(entry->object.vm_object, entry->offset,
 		    failed_addr - entry->start, PQ_ACTIVE);
 	}
 
 	/*
 	 * Assign an out-of-range value to represent the failure to wire this
 	 * entry.
 	 */
 	entry->wired_count = -1;
 }
 
 int
 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
 {
 	int rv;
 
 	vm_map_lock(map);
 	rv = vm_map_wire_locked(map, start, end, flags);
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_wire_locked:
  *
  *	Implements both kernel and user wiring.  Returns with the map locked,
  *	the map lock may be dropped.
  */
 int
 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
 {
 	vm_map_entry_t entry, first_entry, next_entry, prev_entry;
 	vm_offset_t faddr, saved_end, saved_start;
 	u_long incr, npages;
 	u_int bidx, last_timestamp;
 	int rv;
 	bool holes_ok, need_wakeup, user_wire;
 	vm_prot_t prot;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	if (start == end)
 		return (KERN_SUCCESS);
 	prot = 0;
 	if (flags & VM_MAP_WIRE_WRITE)
 		prot |= VM_PROT_WRITE;
 	holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
 	user_wire = (flags & VM_MAP_WIRE_USER) != 0;
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (holes_ok)
 			first_entry = vm_map_entry_succ(first_entry);
 		else
 			return (KERN_INVALID_ADDRESS);
 	}
 	for (entry = first_entry; entry->start < end; entry = next_entry) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			next_entry = vm_map_entry_in_transition(map, start,
 			    &end, holes_ok, entry);
 			if (next_entry == NULL) {
 				if (entry == first_entry)
 					return (KERN_INVALID_ADDRESS);
 				rv = KERN_INVALID_ADDRESS;
 				goto done;
 			}
 			first_entry = (entry == first_entry) ?
 			    next_entry : NULL;
 			continue;
 		}
 		rv = vm_map_clip_start(map, entry, start);
 		if (rv != KERN_SUCCESS)
 			goto done;
 		rv = vm_map_clip_end(map, entry, end);
 		if (rv != KERN_SUCCESS)
 			goto done;
 
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
 		    entry->wiring_thread == NULL,
 		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		entry->wiring_thread = curthread;
 		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
 		    || (entry->protection & prot) != prot) {
 			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
 			if (!holes_ok) {
 				end = entry->end;
 				rv = KERN_INVALID_ADDRESS;
 				goto done;
 			}
 		} else if (entry->wired_count == 0) {
 			entry->wired_count++;
 
 			npages = atop(entry->end - entry->start);
 			if (user_wire && !vm_map_wire_user_count_add(npages)) {
 				vm_map_wire_entry_failure(map, entry,
 				    entry->start);
 				end = entry->end;
 				rv = KERN_RESOURCE_SHORTAGE;
 				goto done;
 			}
 
 			/*
 			 * Release the map lock, relying on the in-transition
 			 * mark.  Mark the map busy for fork.
 			 */
 			saved_start = entry->start;
 			saved_end = entry->end;
 			last_timestamp = map->timestamp;
 			bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
 			    >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
 			incr =  pagesizes[bidx];
 			vm_map_busy(map);
 			vm_map_unlock(map);
 
 			for (faddr = saved_start; faddr < saved_end;
 			    faddr += incr) {
 				/*
 				 * Simulate a fault to get the page and enter
 				 * it into the physical map.
 				 */
 				rv = vm_fault(map, faddr, VM_PROT_NONE,
 				    VM_FAULT_WIRE, NULL);
 				if (rv != KERN_SUCCESS)
 					break;
 			}
 			vm_map_lock(map);
 			vm_map_unbusy(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.  The entry
 				 * may have been clipped, but NOT merged or
 				 * deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &next_entry))
 					KASSERT(false,
 					    ("vm_map_wire: lookup failed"));
 				first_entry = (entry == first_entry) ?
 				    next_entry : NULL;
 				for (entry = next_entry; entry->end < saved_end;
 				    entry = vm_map_entry_succ(entry)) {
 					/*
 					 * In case of failure, handle entries
 					 * that were not fully wired here;
 					 * fully wired entries are handled
 					 * later.
 					 */
 					if (rv != KERN_SUCCESS &&
 					    faddr < entry->end)
 						vm_map_wire_entry_failure(map,
 						    entry, faddr);
 				}
 			}
 			if (rv != KERN_SUCCESS) {
 				vm_map_wire_entry_failure(map, entry, faddr);
 				if (user_wire)
 					vm_map_wire_user_count_sub(npages);
 				end = entry->end;
 				goto done;
 			}
 		} else if (!user_wire ||
 			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			entry->wired_count++;
 		}
 		/*
 		 * Check the map for holes in the specified region.
 		 * If holes_ok was specified, skip this check.
 		 */
 		next_entry = vm_map_entry_succ(entry);
 		if (!holes_ok &&
 		    entry->end < end && next_entry->start > entry->end) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 	}
 	rv = KERN_SUCCESS;
 done:
 	need_wakeup = false;
 	if (first_entry == NULL &&
 	    !vm_map_lookup_entry(map, start, &first_entry)) {
 		KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
 		prev_entry = first_entry;
 		entry = vm_map_entry_succ(first_entry);
 	} else {
 		prev_entry = vm_map_entry_pred(first_entry);
 		entry = first_entry;
 	}
 	for (; entry->start < end;
 	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
 		/*
 		 * If holes_ok was specified, an empty
 		 * space in the unwired region could have been mapped
 		 * while the map lock was dropped for faulting in the
 		 * pages or draining MAP_ENTRY_IN_TRANSITION.
 		 * Moreover, another thread could be simultaneously
 		 * wiring this new mapping entry.  Detect these cases
 		 * and skip any entries marked as in transition not by us.
 		 *
 		 * Another way to get an entry not marked with
 		 * MAP_ENTRY_IN_TRANSITION is after failed clipping,
 		 * which set rv to KERN_INVALID_ARGUMENT.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
 		    entry->wiring_thread != curthread) {
 			KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
 			    ("vm_map_wire: !HOLESOK and new/changed entry"));
 			continue;
 		}
 
 		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
 			/* do nothing */
 		} else if (rv == KERN_SUCCESS) {
 			if (user_wire)
 				entry->eflags |= MAP_ENTRY_USER_WIRED;
 		} else if (entry->wired_count == -1) {
 			/*
 			 * Wiring failed on this entry.  Thus, unwiring is
 			 * unnecessary.
 			 */
 			entry->wired_count = 0;
 		} else if (!user_wire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			/*
 			 * Undo the wiring.  Wiring succeeded on this entry
 			 * but failed on a later entry.  
 			 */
 			if (entry->wired_count == 1) {
 				vm_map_entry_unwire(map, entry);
 				if (user_wire)
 					vm_map_wire_user_count_sub(
 					    atop(entry->end - entry->start));
 			} else
 				entry->wired_count--;
 		}
 		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
 		    ("vm_map_wire: in-transition flag missing %p", entry));
 		KASSERT(entry->wiring_thread == curthread,
 		    ("vm_map_wire: alien wire %p", entry));
 		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
 		    MAP_ENTRY_WIRE_SKIPPED);
 		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = true;
 		}
 		vm_map_try_merge_entries(map, prev_entry, entry);
 	}
 	vm_map_try_merge_entries(map, prev_entry, entry);
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  * vm_map_sync
  *
  * Push any dirty cached pages in the address range to their pager.
  * If syncio is TRUE, dirty pages are written synchronously.
  * If invalidate is TRUE, any cached pages are freed as well.
  *
  * If the size of the region from start to end is zero, we are
  * supposed to flush all modified pages within the region containing
  * start.  Unfortunately, a region can be split or coalesced with
  * neighboring regions, making it difficult to determine what the
  * original region was.  Therefore, we approximate this requirement by
  * flushing the current region containing start.
  *
  * Returns an error if any part of the specified range is not mapped.
  */
 int
 vm_map_sync(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	boolean_t syncio,
 	boolean_t invalidate)
 {
 	vm_map_entry_t entry, first_entry, next_entry;
 	vm_size_t size;
 	vm_object_t object;
 	vm_ooffset_t offset;
 	unsigned int last_timestamp;
 	int bdry_idx;
 	boolean_t failed;
 
 	vm_map_lock_read(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	} else if (start == end) {
 		start = first_entry->start;
 		end = first_entry->end;
 	}
 
 	/*
 	 * Make a first pass to check for user-wired memory, holes,
 	 * and partial invalidation of largepage mappings.
 	 */
 	for (entry = first_entry; entry->start < end; entry = next_entry) {
 		if (invalidate) {
 			if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
 				vm_map_unlock_read(map);
 				return (KERN_INVALID_ARGUMENT);
 			}
 			bdry_idx = (entry->eflags &
 			    MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
 			    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
 			if (bdry_idx != 0 &&
 			    ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
 			    (end & (pagesizes[bdry_idx] - 1)) != 0)) {
 				vm_map_unlock_read(map);
 				return (KERN_INVALID_ARGUMENT);
 			}
 		}
 		next_entry = vm_map_entry_succ(entry);
 		if (end > entry->end &&
 		    entry->end != next_entry->start) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 
 	if (invalidate)
 		pmap_remove(map->pmap, start, end);
 	failed = FALSE;
 
 	/*
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
 	for (entry = first_entry; entry->start < end;) {
 		offset = entry->offset + (start - entry->start);
 		size = (end <= entry->end ? end : entry->end) - start;
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
 			vm_map_t smap;
 			vm_map_entry_t tentry;
 			vm_size_t tsize;
 
 			smap = entry->object.sub_map;
 			vm_map_lock_read(smap);
 			(void) vm_map_lookup_entry(smap, offset, &tentry);
 			tsize = tentry->end - offset;
 			if (tsize < size)
 				size = tsize;
 			object = tentry->object.vm_object;
 			offset = tentry->offset + (offset - tentry->start);
 			vm_map_unlock_read(smap);
 		} else {
 			object = entry->object.vm_object;
 		}
 		vm_object_reference(object);
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 		if (!vm_object_sync(object, offset, size, syncio, invalidate))
 			failed = TRUE;
 		start += size;
 		vm_object_deallocate(object);
 		vm_map_lock_read(map);
 		if (last_timestamp == map->timestamp ||
 		    !vm_map_lookup_entry(map, start, &entry))
 			entry = vm_map_entry_succ(entry);
 	}
 
 	vm_map_unlock_read(map);
 	return (failed ? KERN_FAILURE : KERN_SUCCESS);
 }
 
 /*
  *	vm_map_entry_unwire:	[ internal use only ]
  *
  *	Make the region specified by this entry pageable.
  *
  *	The map in question should be locked.
  *	[This is the reason for this routine's existence.]
  */
 static void
 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_size_t size;
 
 	VM_MAP_ASSERT_LOCKED(map);
 	KASSERT(entry->wired_count > 0,
 	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
 
 	size = entry->end - entry->start;
 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
 		vm_map_wire_user_count_sub(atop(size));
 	pmap_unwire(map->pmap, entry->start, entry->end);
 	vm_object_unwire(entry->object.vm_object, entry->offset, size,
 	    PQ_ACTIVE);
 	entry->wired_count = 0;
 }
 
 static void
 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
 {
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
 		vm_object_deallocate(entry->object.vm_object);
 	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
 }
 
 /*
  *	vm_map_entry_delete:	[ internal use only ]
  *
  *	Deallocate the given entry from the target map.
  */
 static void
 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t object;
 	vm_pindex_t offidxstart, offidxend, size1;
 	vm_size_t size;
 
 	vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
 	object = entry->object.vm_object;
 
 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
 		MPASS(entry->cred == NULL);
 		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
 		MPASS(object == NULL);
 		vm_map_entry_deallocate(entry, map->system_map);
 		return;
 	}
 
 	size = entry->end - entry->start;
 	map->size -= size;
 
 	if (entry->cred != NULL) {
 		swap_release_by_cred(size, entry->cred);
 		crfree(entry->cred);
 	}
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
 		entry->object.vm_object = NULL;
 	} else if ((object->flags & OBJ_ANON) != 0 ||
 	    object == kernel_object) {
 		KASSERT(entry->cred == NULL || object->cred == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
 		offidxstart = OFF_TO_IDX(entry->offset);
 		offidxend = offidxstart + atop(size);
 		VM_OBJECT_WLOCK(object);
 		if (object->ref_count != 1 &&
 		    ((object->flags & OBJ_ONEMAPPING) != 0 ||
 		    object == kernel_object)) {
 			vm_object_collapse(object);
 
 			/*
 			 * The option OBJPR_NOTMAPPED can be passed here
 			 * because vm_map_delete() already performed
 			 * pmap_remove() on the only mapping to this range
 			 * of pages. 
 			 */
 			vm_object_page_remove(object, offidxstart, offidxend,
 			    OBJPR_NOTMAPPED);
 			if (offidxend >= object->size &&
 			    offidxstart < object->size) {
 				size1 = object->size;
 				object->size = offidxstart;
 				if (object->cred != NULL) {
 					size1 -= object->size;
 					KASSERT(object->charge >= ptoa(size1),
 					    ("object %p charge < 0", object));
 					swap_release_by_cred(ptoa(size1),
 					    object->cred);
 					object->charge -= ptoa(size1);
 				}
 			}
 		}
 		VM_OBJECT_WUNLOCK(object);
 	}
 	if (map->system_map)
 		vm_map_entry_deallocate(entry, TRUE);
 	else {
 		entry->defer_next = curthread->td_map_def_user;
 		curthread->td_map_def_user = entry;
 	}
 }
 
 /*
  *	vm_map_delete:	[ internal use only ]
  *
  *	Deallocates the given address range from the target
  *	map.
  */
 int
 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	vm_map_entry_t entry, next_entry, scratch_entry;
 	int rv;
 
 	VM_MAP_ASSERT_LOCKED(map);
 
 	if (start == end)
 		return (KERN_SUCCESS);
 
 	/*
 	 * Find the start of the region, and clip it.
 	 * Step through all entries in this region.
 	 */
 	rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
 	if (rv != KERN_SUCCESS)
 		return (rv);
 	for (; entry->start < end; entry = next_entry) {
 		/*
 		 * Wait for wiring or unwiring of an entry to complete.
 		 * Also wait for any system wirings to disappear on
 		 * user maps.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
 		    (vm_map_pmap(map) != kernel_pmap &&
 		    vm_map_entry_system_wired_count(entry) != 0)) {
 			unsigned int last_timestamp;
 			vm_offset_t saved_start;
 
 			saved_start = entry->start;
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			last_timestamp = map->timestamp;
 			(void) vm_map_unlock_and_wait(map, 0);
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				rv = vm_map_lookup_clip_start(map, saved_start,
 				    &next_entry, &scratch_entry);
 				if (rv != KERN_SUCCESS)
 					break;
 			} else
 				next_entry = entry;
 			continue;
 		}
 
 		/* XXXKIB or delete to the upper superpage boundary ? */
 		rv = vm_map_clip_end(map, entry, end);
 		if (rv != KERN_SUCCESS)
 			break;
 		next_entry = vm_map_entry_succ(entry);
 
 		/*
 		 * Unwire before removing addresses from the pmap; otherwise,
 		 * unwiring will put the entries back in the pmap.
 		 */
 		if (entry->wired_count != 0)
 			vm_map_entry_unwire(map, entry);
 
 		/*
 		 * Remove mappings for the pages, but only if the
 		 * mappings could exist.  For instance, it does not
 		 * make sense to call pmap_remove() for guard entries.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
 		    entry->object.vm_object != NULL)
 			pmap_remove(map->pmap, entry->start, entry->end);
 
 		if (entry->end == map->anon_loc)
 			map->anon_loc = entry->start;
 
 		/*
 		 * Delete the entry only after removing all pmap
 		 * entries pointing to its pages.  (Otherwise, its
 		 * page frames may be reallocated, and any modify bits
 		 * will be set in the wrong object!)
 		 */
 		vm_map_entry_delete(map, entry);
 	}
 	return (rv);
 }
 
 /*
  *	vm_map_remove:
  *
  *	Remove the given address range from the target map.
  *	This is the exported form of vm_map_delete.
  */
 int
 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	int result;
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	result = vm_map_delete(map, start, end);
 	vm_map_unlock(map);
 	return (result);
 }
 
 /*
  *	vm_map_check_protection:
  *
  *	Assert that the target map allows the specified privilege on the
  *	entire address region given.  The entire region must be allocated.
  *
  *	WARNING!  This code does not and should not check whether the
  *	contents of the region is accessible.  For example a smaller file
  *	might be mapped into a larger address space.
  *
  *	NOTE!  This code is also called by munmap().
  *
  *	The map must be locked.  A read lock is sufficient.
  */
 boolean_t
 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
 			vm_prot_t protection)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t tmp_entry;
 
 	if (!vm_map_lookup_entry(map, start, &tmp_entry))
 		return (FALSE);
 	entry = tmp_entry;
 
 	while (start < end) {
 		/*
 		 * No holes allowed!
 		 */
 		if (start < entry->start)
 			return (FALSE);
 		/*
 		 * Check protection associated with entry.
 		 */
 		if ((entry->protection & protection) != protection)
 			return (FALSE);
 		/* go to next entry */
 		start = entry->end;
 		entry = vm_map_entry_succ(entry);
 	}
 	return (TRUE);
 }
 
 /*
  *
  *	vm_map_copy_swap_object:
  *
  *	Copies a swap-backed object from an existing map entry to a
  *	new one.  Carries forward the swap charge.  May change the
  *	src object on return.
  */
 static void
 vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
     vm_offset_t size, vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
 	struct ucred *cred;
 	int charged;
 
 	src_object = src_entry->object.vm_object;
 	charged = ENTRY_CHARGED(src_entry);
 	if ((src_object->flags & OBJ_ANON) != 0) {
 		VM_OBJECT_WLOCK(src_object);
 		vm_object_collapse(src_object);
 		if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
 			vm_object_split(src_entry);
 			src_object = src_entry->object.vm_object;
 		}
 		vm_object_reference_locked(src_object);
 		vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
 		VM_OBJECT_WUNLOCK(src_object);
 	} else
 		vm_object_reference(src_object);
 	if (src_entry->cred != NULL &&
 	    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 		KASSERT(src_object->cred == NULL,
 		    ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
 		     src_object));
 		src_object->cred = src_entry->cred;
 		src_object->charge = size;
 	}
 	dst_entry->object.vm_object = src_object;
 	if (charged) {
 		cred = curthread->td_ucred;
 		crhold(cred);
 		dst_entry->cred = cred;
 		*fork_charge += size;
 		if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			crhold(cred);
 			src_entry->cred = cred;
 			*fork_charge += size;
 		}
 	}
 }
 
 /*
  *	vm_map_copy_entry:
  *
  *	Copies the contents of the source entry to the destination
  *	entry.  The entries *must* be aligned properly.
  */
 static void
 vm_map_copy_entry(
 	vm_map_t src_map,
 	vm_map_t dst_map,
 	vm_map_entry_t src_entry,
 	vm_map_entry_t dst_entry,
 	vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
 	vm_map_entry_t fake_entry;
 	vm_offset_t size;
 
 	VM_MAP_ASSERT_LOCKED(dst_map);
 
 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
 		return;
 
 	if (src_entry->wired_count == 0 ||
 	    (src_entry->protection & VM_PROT_WRITE) == 0) {
 		/*
 		 * If the source entry is marked needs_copy, it is already
 		 * write-protected.
 		 */
 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
 		    (src_entry->protection & VM_PROT_WRITE) != 0) {
 			pmap_protect(src_map->pmap,
 			    src_entry->start,
 			    src_entry->end,
 			    src_entry->protection & ~VM_PROT_WRITE);
 		}
 
 		/*
 		 * Make a copy of the object.
 		 */
 		size = src_entry->end - src_entry->start;
 		if ((src_object = src_entry->object.vm_object) != NULL) {
 			if (src_object->type == OBJT_DEFAULT ||
 			    src_object->type == OBJT_SWAP) {
 				vm_map_copy_swap_object(src_entry, dst_entry,
 				    size, fork_charge);
 				/* May have split/collapsed, reload obj. */
 				src_object = src_entry->object.vm_object;
 			} else {
 				vm_object_reference(src_object);
 				dst_entry->object.vm_object = src_object;
 			}
 			src_entry->eflags |= MAP_ENTRY_COW |
 			    MAP_ENTRY_NEEDS_COPY;
 			dst_entry->eflags |= MAP_ENTRY_COW |
 			    MAP_ENTRY_NEEDS_COPY;
 			dst_entry->offset = src_entry->offset;
 			if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
 				/*
 				 * MAP_ENTRY_WRITECNT cannot
 				 * indicate write reference from
 				 * src_entry, since the entry is
 				 * marked as needs copy.  Allocate a
 				 * fake entry that is used to
 				 * decrement object->un_pager writecount
 				 * at the appropriate time.  Attach
 				 * fake_entry to the deferred list.
 				 */
 				fake_entry = vm_map_entry_create(dst_map);
 				fake_entry->eflags = MAP_ENTRY_WRITECNT;
 				src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
 				vm_object_reference(src_object);
 				fake_entry->object.vm_object = src_object;
 				fake_entry->start = src_entry->start;
 				fake_entry->end = src_entry->end;
 				fake_entry->defer_next =
 				    curthread->td_map_def_user;
 				curthread->td_map_def_user = fake_entry;
 			}
 
 			pmap_copy(dst_map->pmap, src_map->pmap,
 			    dst_entry->start, dst_entry->end - dst_entry->start,
 			    src_entry->start);
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
 			if (src_entry->cred != NULL) {
 				dst_entry->cred = curthread->td_ucred;
 				crhold(dst_entry->cred);
 				*fork_charge += size;
 			}
 		}
 	} else {
 		/*
 		 * We don't want to make writeable wired pages copy-on-write.
 		 * Immediately copy these pages into the new map by simulating
 		 * page faults.  The new pages are pageable.
 		 */
 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
 		    fork_charge);
 	}
 }
 
 /*
  * vmspace_map_entry_forked:
  * Update the newly-forked vmspace each time a map entry is inherited
  * or copied.  The values for vm_dsize and vm_tsize are approximate
  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
  */
 static void
 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
     vm_map_entry_t entry)
 {
 	vm_size_t entrysize;
 	vm_offset_t newend;
 
 	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
 		return;
 	entrysize = entry->end - entry->start;
 	vm2->vm_map.size += entrysize;
 	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
 		vm2->vm_ssize += btoc(entrysize);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
 	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
 		vm2->vm_dsize += btoc(newend - entry->start);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
 	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
 		vm2->vm_tsize += btoc(newend - entry->start);
 	}
 }
 
 /*
  * vmspace_fork:
  * Create a new process vmspace structure and vm_map
  * based on those of an existing process.  The new map
  * is based on the old map, according to the inheritance
  * values on the regions in that map.
  *
  * XXX It might be worth coalescing the entries added to the new vmspace.
  *
  * The source map must not be locked.
  */
 struct vmspace *
 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 {
 	struct vmspace *vm2;
 	vm_map_t new_map, old_map;
 	vm_map_entry_t new_entry, old_entry;
 	vm_object_t object;
 	int error, locked;
 	vm_inherit_t inh;
 
 	old_map = &vm1->vm_map;
 	/* Copy immutable fields of vm1 to vm2. */
 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
 	    pmap_pinit);
 	if (vm2 == NULL)
 		return (NULL);
 
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
 	vm_map_lock(old_map);
 	if (old_map->busy)
 		vm_map_wait_busy(old_map);
 	new_map = &vm2->vm_map;
 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
 	KASSERT(locked, ("vmspace_fork: lock failed"));
 
 	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
 	if (error != 0) {
 		sx_xunlock(&old_map->lock);
 		sx_xunlock(&new_map->lock);
 		vm_map_process_deferred();
 		vmspace_free(vm2);
 		return (NULL);
 	}
 
 	new_map->anon_loc = old_map->anon_loc;
 	new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART);
 
 	VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
 		if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			panic("vm_map_fork: encountered a submap");
 
 		inh = old_entry->inheritance;
 		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
 		    inh != VM_INHERIT_NONE)
 			inh = VM_INHERIT_COPY;
 
 		switch (inh) {
 		case VM_INHERIT_NONE:
 			break;
 
 		case VM_INHERIT_SHARE:
 			/*
 			 * Clone the entry, creating the shared object if
 			 * necessary.
 			 */
 			object = old_entry->object.vm_object;
 			if (object == NULL) {
 				vm_map_entry_back(old_entry);
 				object = old_entry->object.vm_object;
 			}
 
 			/*
 			 * Add the reference before calling vm_object_shadow
 			 * to insure that a shadow object is created.
 			 */
 			vm_object_reference(object);
 			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 				vm_object_shadow(&old_entry->object.vm_object,
 				    &old_entry->offset,
 				    old_entry->end - old_entry->start,
 				    old_entry->cred,
 				    /* Transfer the second reference too. */
 				    true);
 				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 				old_entry->cred = NULL;
 
 				/*
 				 * As in vm_map_merged_neighbor_dispose(),
 				 * the vnode lock will not be acquired in
 				 * this call to vm_object_deallocate().
 				 */
 				vm_object_deallocate(object);
 				object = old_entry->object.vm_object;
 			} else {
 				VM_OBJECT_WLOCK(object);
 				vm_object_clear_flag(object, OBJ_ONEMAPPING);
 				if (old_entry->cred != NULL) {
 					KASSERT(object->cred == NULL,
 					    ("vmspace_fork both cred"));
 					object->cred = old_entry->cred;
 					object->charge = old_entry->end -
 					    old_entry->start;
 					old_entry->cred = NULL;
 				}
 
 				/*
 				 * Assert the correct state of the vnode
 				 * v_writecount while the object is locked, to
 				 * not relock it later for the assertion
 				 * correctness.
 				 */
 				if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
 				    object->type == OBJT_VNODE) {
 					KASSERT(((struct vnode *)object->
 					    handle)->v_writecount > 0,
 					    ("vmspace_fork: v_writecount %p",
 					    object));
 					KASSERT(object->un_pager.vnp.
 					    writemappings > 0,
 					    ("vmspace_fork: vnp.writecount %p",
 					    object));
 				}
 				VM_OBJECT_WUNLOCK(object);
 			}
 
 			/*
 			 * Clone the entry, referencing the shared object.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
 				vm_pager_update_writecount(object,
 				    new_entry->start, new_entry->end);
 			}
 			vm_map_entry_set_vnode_text(new_entry, true);
 
 			/*
 			 * Insert the entry into the new map -- we know we're
 			 * inserting at the end of the new map.
 			 */
 			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			/*
 			 * Update the physical map
 			 */
 			pmap_copy(new_map->pmap, old_map->pmap,
 			    new_entry->start,
 			    (old_entry->end - old_entry->start),
 			    old_entry->start);
 			break;
 
 		case VM_INHERIT_COPY:
 			/*
 			 * Clone the entry and link into the map.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			/*
 			 * Copied entry is COW over the old object.
 			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
 			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
 			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 			vm_map_copy_entry(old_map, new_map, old_entry,
 			    new_entry, fork_charge);
 			vm_map_entry_set_vnode_text(new_entry, true);
 			break;
 
 		case VM_INHERIT_ZERO:
 			/*
 			 * Create a new anonymous mapping entry modelled from
 			 * the old one.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			memset(new_entry, 0, sizeof(*new_entry));
 
 			new_entry->start = old_entry->start;
 			new_entry->end = old_entry->end;
 			new_entry->eflags = old_entry->eflags &
 			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
 			    MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
 			    MAP_ENTRY_SPLIT_BOUNDARY_MASK);
 			new_entry->protection = old_entry->protection;
 			new_entry->max_protection = old_entry->max_protection;
 			new_entry->inheritance = VM_INHERIT_ZERO;
 
 			vm_map_entry_link(new_map, new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			new_entry->cred = curthread->td_ucred;
 			crhold(new_entry->cred);
 			*fork_charge += (new_entry->end - new_entry->start);
 
 			break;
 		}
 	}
 	/*
 	 * Use inlined vm_map_unlock() to postpone handling the deferred
 	 * map entries, which cannot be done until both old_map and
 	 * new_map locks are released.
 	 */
 	sx_xunlock(&old_map->lock);
 	sx_xunlock(&new_map->lock);
 	vm_map_process_deferred();
 
 	return (vm2);
 }
 
 /*
  * Create a process's stack for exec_new_vmspace().  This function is never
  * asked to wire the newly created stack.
  */
 int
 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_size_t growsize, init_ssize;
 	rlim_t vmemlim;
 	int rv;
 
 	MPASS((map->flags & MAP_WIREFUTURE) == 0);
 	growsize = sgrowsiz;
 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
 	vm_map_lock(map);
 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + init_ssize > vmemlim) {
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
 	    max, cow);
 out:
 	vm_map_unlock(map);
 	return (rv);
 }
 
 static int stack_guard_page = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
     &stack_guard_page, 0,
     "Specifies the number of guard pages for a stack that grows");
 
 static int
 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, prev_entry;
 	vm_offset_t bot, gap_bot, gap_top, top;
 	vm_size_t init_ssize, sgp;
 	int orient, rv;
 
 	/*
 	 * The stack orientation is piggybacked with the cow argument.
 	 * Extract it into orient and mask the cow argument so that we
 	 * don't pass it around further.
 	 */
 	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
 	KASSERT(orient != 0, ("No stack grow direction"));
 	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
 	    ("bi-dir stack"));
 
 	if (max_ssize == 0 ||
 	    !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
 		return (KERN_INVALID_ADDRESS);
 	sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
 	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
 	    (vm_size_t)stack_guard_page * PAGE_SIZE;
 	if (sgp >= max_ssize)
 		return (KERN_INVALID_ARGUMENT);
 
 	init_ssize = growsize;
 	if (max_ssize < init_ssize + sgp)
 		init_ssize = max_ssize - sgp;
 
 	/* If addr is already mapped, no go */
 	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
 		return (KERN_NO_SPACE);
 
 	/*
 	 * If we can't accommodate max_ssize in the current mapping, no go.
 	 */
 	if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
 		return (KERN_NO_SPACE);
 
 	/*
 	 * We initially map a stack of only init_ssize.  We will grow as
 	 * needed later.  Depending on the orientation of the stack (i.e.
 	 * the grow direction) we either map at the top of the range, the
 	 * bottom of the range or in the middle.
 	 *
 	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
 	 * and cow to be 0.  Possibly we should eliminate these as input
 	 * parameters, and just pass these values here in the insert call.
 	 */
 	if (orient == MAP_STACK_GROWS_DOWN) {
 		bot = addrbos + max_ssize - init_ssize;
 		top = bot + init_ssize;
 		gap_bot = addrbos;
 		gap_top = bot;
 	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
 		bot = addrbos;
 		top = bot + init_ssize;
 		gap_bot = top;
 		gap_top = addrbos + max_ssize;
 	}
 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
 	if (rv != KERN_SUCCESS)
 		return (rv);
 	new_entry = vm_map_entry_succ(prev_entry);
 	KASSERT(new_entry->end == top || new_entry->start == bot,
 	    ("Bad entry start/end for new stack entry"));
 	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
 	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
 	if (gap_bot == gap_top)
 		return (KERN_SUCCESS);
 	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
 	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
 	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
 	if (rv == KERN_SUCCESS) {
 		/*
 		 * Gap can never successfully handle a fault, so
 		 * read-ahead logic is never used for it.  Re-use
 		 * next_read of the gap entry to store
 		 * stack_guard_page for vm_map_growstack().
 		 */
 		if (orient == MAP_STACK_GROWS_DOWN)
 			vm_map_entry_pred(new_entry)->next_read = sgp;
 		else
 			vm_map_entry_succ(new_entry)->next_read = sgp;
 	} else {
 		(void)vm_map_delete(map, bot, top);
 	}
 	return (rv);
 }
 
 /*
  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
  * successfully grow the stack.
  */
 static int
 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
 {
 	vm_map_entry_t stack_entry;
 	struct proc *p;
 	struct vmspace *vm;
 	struct ucred *cred;
 	vm_offset_t gap_end, gap_start, grow_start;
 	vm_size_t grow_amount, guard, max_grow;
 	rlim_t lmemlim, stacklim, vmemlim;
 	int rv, rv1;
 	bool gap_deleted, grow_down, is_procstack;
 #ifdef notyet
 	uint64_t limit;
 #endif
 #ifdef RACCT
 	int error;
 #endif
 
 	p = curproc;
 	vm = p->p_vmspace;
 
 	/*
 	 * Disallow stack growth when the access is performed by a
 	 * debugger or AIO daemon.  The reason is that the wrong
 	 * resource limits are applied.
 	 */
 	if (p != initproc && (map != &p->p_vmspace->vm_map ||
 	    p->p_textvp == NULL))
 		return (KERN_FAILURE);
 
 	MPASS(!map->system_map);
 
 	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
 	stacklim = lim_cur(curthread, RLIMIT_STACK);
 	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 retry:
 	/* If addr is not in a hole for a stack grow area, no need to grow. */
 	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
 		return (KERN_FAILURE);
 	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
 		return (KERN_SUCCESS);
 	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
 		stack_entry = vm_map_entry_succ(gap_entry);
 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
 		    stack_entry->start != gap_entry->end)
 			return (KERN_FAILURE);
 		grow_amount = round_page(stack_entry->start - addr);
 		grow_down = true;
 	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
 		stack_entry = vm_map_entry_pred(gap_entry);
 		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
 		    stack_entry->end != gap_entry->start)
 			return (KERN_FAILURE);
 		grow_amount = round_page(addr + 1 - stack_entry->end);
 		grow_down = false;
 	} else {
 		return (KERN_FAILURE);
 	}
 	guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
 	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
 	    gap_entry->next_read;
 	max_grow = gap_entry->end - gap_entry->start;
 	if (guard > max_grow)
 		return (KERN_NO_SPACE);
 	max_grow -= guard;
 	if (grow_amount > max_grow)
 		return (KERN_NO_SPACE);
 
 	/*
 	 * If this is the main process stack, see if we're over the stack
 	 * limit.
 	 */
 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
 	    addr < (vm_offset_t)p->p_sysent->sv_usrstack;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
 		return (KERN_NO_SPACE);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		if (is_procstack && racct_set(p, RACCT_STACK,
 		    ctob(vm->vm_ssize) + grow_amount)) {
 			PROC_UNLOCK(p);
 			return (KERN_NO_SPACE);
 		}
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	grow_amount = roundup(grow_amount, sgrowsiz);
 	if (grow_amount > max_grow)
 		grow_amount = max_grow;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		grow_amount = trunc_page((vm_size_t)stacklim) -
 		    ctob(vm->vm_ssize);
 	}
 
 #ifdef notyet
 	PROC_LOCK(p);
 	limit = racct_get_available(p, RACCT_STACK);
 	PROC_UNLOCK(p);
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
 		grow_amount = limit - ctob(vm->vm_ssize);
 #endif
 
 	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
 		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(p);
 			if (racct_set(p, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
 				PROC_UNLOCK(p);
 				rv = KERN_NO_SPACE;
 				goto out;
 			}
 			PROC_UNLOCK(p);
 		}
 #endif
 	}
 
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + grow_amount > vmemlim) {
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
 			PROC_UNLOCK(p);
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	if (vm_map_lock_upgrade(map)) {
 		gap_entry = NULL;
 		vm_map_lock_read(map);
 		goto retry;
 	}
 
 	if (grow_down) {
 		grow_start = gap_entry->end - grow_amount;
 		if (gap_entry->start + grow_amount == gap_entry->end) {
 			gap_start = gap_entry->start;
 			gap_end = gap_entry->end;
 			vm_map_entry_delete(map, gap_entry);
 			gap_deleted = true;
 		} else {
 			MPASS(gap_entry->start < gap_entry->end - grow_amount);
 			vm_map_entry_resize(map, gap_entry, -grow_amount);
 			gap_deleted = false;
 		}
 		rv = vm_map_insert(map, NULL, 0, grow_start,
 		    grow_start + grow_amount,
 		    stack_entry->protection, stack_entry->max_protection,
 		    MAP_STACK_GROWS_DOWN);
 		if (rv != KERN_SUCCESS) {
 			if (gap_deleted) {
 				rv1 = vm_map_insert(map, NULL, 0, gap_start,
 				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
 				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
 				MPASS(rv1 == KERN_SUCCESS);
 			} else
 				vm_map_entry_resize(map, gap_entry,
 				    grow_amount);
 		}
 	} else {
 		grow_start = stack_entry->end;
 		cred = stack_entry->cred;
 		if (cred == NULL && stack_entry->object.vm_object != NULL)
 			cred = stack_entry->object.vm_object->cred;
 		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
 			rv = KERN_NO_SPACE;
 		/* Grow the underlying object if applicable. */
 		else if (stack_entry->object.vm_object == NULL ||
 		    vm_object_coalesce(stack_entry->object.vm_object,
 		    stack_entry->offset,
 		    (vm_size_t)(stack_entry->end - stack_entry->start),
 		    grow_amount, cred != NULL)) {
 			if (gap_entry->start + grow_amount == gap_entry->end) {
 				vm_map_entry_delete(map, gap_entry);
 				vm_map_entry_resize(map, stack_entry,
 				    grow_amount);
 			} else {
 				gap_entry->start += grow_amount;
 				stack_entry->end += grow_amount;
 			}
 			map->size += grow_amount;
 			rv = KERN_SUCCESS;
 		} else
 			rv = KERN_FAILURE;
 	}
 	if (rv == KERN_SUCCESS && is_procstack)
 		vm->vm_ssize += btoc(grow_amount);
 
 	/*
 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
 	 */
 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
 		rv = vm_map_wire_locked(map, grow_start,
 		    grow_start + grow_amount,
 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 	}
 	vm_map_lock_downgrade(map);
 
 out:
 #ifdef RACCT
 	if (racct_enable && rv != KERN_SUCCESS) {
 		PROC_LOCK(p);
 		error = racct_set(p, RACCT_VMEM, map->size);
 		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
 		if (!old_mlock) {
 			error = racct_set(p, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)));
 			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
 		}
 	    	error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
 		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	return (rv);
 }
 
 /*
  * Unshare the specified VM space for exec.  If other processes are
  * mapped to it, then create a new one.  The new vmspace is null.
  */
 int
 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
 	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("vmspace_exec recursed"));
 	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
 	/*
 	 * This code is written like this for prototype purposes.  The
 	 * goal is to avoid running down the vmspace here, but let the
 	 * other process's that are still using the vmspace to finally
 	 * run it down.  Even though there is little or no chance of blocking
 	 * here, it is a good idea to keep this form for future mods.
 	 */
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)
 		pmap_activate(curthread);
 	curthread->td_pflags |= TDP_EXECVMSPC;
 	return (0);
 }
 
 /*
  * Unshare the specified VM space for forcing COW.  This
  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
  */
 int
 vmspace_unshare(struct proc *p)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 	vm_ooffset_t fork_charge;
 
-	if (oldvmspace->vm_refcnt == 1)
+	if (refcount_load(&oldvmspace->vm_refcnt) == 1)
 		return (0);
 	fork_charge = 0;
 	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
 		vmspace_free(newvmspace);
 		return (ENOMEM);
 	}
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)
 		pmap_activate(curthread);
 	vmspace_free(oldvmspace);
 	return (0);
 }
 
 /*
  *	vm_map_lookup:
  *
  *	Finds the VM object, offset, and
  *	protection for a given virtual address in the
  *	specified map, assuming a page fault of the
  *	type specified.
  *
  *	Leaves the map in question locked for read; return
  *	values are guaranteed until a vm_map_lookup_done
  *	call is performed.  Note that the map argument
  *	is in/out; the returned map must be used in
  *	the call to vm_map_lookup_done.
  *
  *	A handle (out_entry) is returned for use in
  *	vm_map_lookup_done, to make that fast.
  *
  *	If a lookup is requested with "write protection"
  *	specified, the map may be changed to perform virtual
  *	copying operations, although the data referenced will
  *	remain the same.
  */
 int
 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
 	      vm_offset_t vaddr,
 	      vm_prot_t fault_typea,
 	      vm_map_entry_t *out_entry,	/* OUT */
 	      vm_object_t *object,		/* OUT */
 	      vm_pindex_t *pindex,		/* OUT */
 	      vm_prot_t *out_prot,		/* OUT */
 	      boolean_t *wired)			/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type;
 	vm_object_t eobject;
 	vm_size_t size;
 	struct ucred *cred;
 
 RetryLookup:
 
 	vm_map_lock_read(map);
 
 RetryLookupLocked:
 	/*
 	 * Lookup the faulting address.
 	 */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	}
 
 	entry = *out_entry;
 
 	/*
 	 * Handle submaps.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		vm_map_t old_map = map;
 
 		*var_map = map = entry->object.sub_map;
 		vm_map_unlock_read(old_map);
 		goto RetryLookup;
 	}
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 */
 	prot = entry->protection;
 	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
 		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
 		if (prot == VM_PROT_NONE && map != kernel_map &&
 		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
 		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
 		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
 		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
 			goto RetryLookupLocked;
 	}
 	fault_type = fault_typea & VM_PROT_ALL;
 	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
 	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
 	    ("entry %p flags %x", entry, entry->eflags));
 	if ((fault_typea & VM_PROT_COPY) != 0 &&
 	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
 	    (entry->eflags & MAP_ENTRY_COW) == 0) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		fault_type = entry->protection;
 	size = entry->end - entry->start;
 
 	/*
 	 * If the entry was copy-on-write, we either ...
 	 */
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * If we want to write the page, we may as well handle that
 		 * now since we've got the map locked.
 		 *
 		 * If we don't need to write the page, we just demote the
 		 * permissions allowed.
 		 */
 		if ((fault_type & VM_PROT_WRITE) != 0 ||
 		    (fault_typea & VM_PROT_COPY) != 0) {
 			/*
 			 * Make a new object, and place it in the object
 			 * chain.  Note that no new references have appeared
 			 * -- one just moved from the map to the new
 			 * object.
 			 */
 			if (vm_map_lock_upgrade(map))
 				goto RetryLookup;
 
 			if (entry->cred == NULL) {
 				/*
 				 * The debugger owner is charged for
 				 * the memory.
 				 */
 				cred = curthread->td_ucred;
 				crhold(cred);
 				if (!swap_reserve_by_cred(size, cred)) {
 					crfree(cred);
 					vm_map_unlock(map);
 					return (KERN_RESOURCE_SHORTAGE);
 				}
 				entry->cred = cred;
 			}
 			eobject = entry->object.vm_object;
 			vm_object_shadow(&entry->object.vm_object,
 			    &entry->offset, size, entry->cred, false);
 			if (eobject == entry->object.vm_object) {
 				/*
 				 * The object was not shadowed.
 				 */
 				swap_release_by_cred(size, entry->cred);
 				crfree(entry->cred);
 			}
 			entry->cred = NULL;
 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 
 			vm_map_lock_downgrade(map);
 		} else {
 			/*
 			 * We're attempting to read a copy-on-write page --
 			 * don't allow writes.
 			 */
 			prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * Create an object if necessary.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map) {
 		if (vm_map_lock_upgrade(map))
 			goto RetryLookup;
 		entry->object.vm_object = vm_object_allocate_anon(atop(size),
 		    NULL, entry->cred, entry->cred != NULL ? size : 0);
 		entry->offset = 0;
 		entry->cred = NULL;
 		vm_map_lock_downgrade(map);
 	}
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_locked:
  *
  *	Lookup the faulting address.  A version of vm_map_lookup that returns 
  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
  */
 int
 vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
 		     vm_offset_t vaddr,
 		     vm_prot_t fault_typea,
 		     vm_map_entry_t *out_entry,	/* OUT */
 		     vm_object_t *object,	/* OUT */
 		     vm_pindex_t *pindex,	/* OUT */
 		     vm_prot_t *out_prot,	/* OUT */
 		     boolean_t *wired)		/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
 
 	/*
 	 * Lookup the faulting address.
 	 */
 	if (!vm_map_lookup_entry(map, vaddr, out_entry))
 		return (KERN_INVALID_ADDRESS);
 
 	entry = *out_entry;
 
 	/*
 	 * Fail if the entry refers to a submap.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 		return (KERN_FAILURE);
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 */
 	prot = entry->protection;
 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 	if ((fault_type & prot) != fault_type)
 		return (KERN_PROTECTION_FAILURE);
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		fault_type = entry->protection;
 
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * Fail if the entry was copy-on-write for a write fault.
 		 */
 		if (fault_type & VM_PROT_WRITE)
 			return (KERN_FAILURE);
 		/*
 		 * We're attempting to read a copy-on-write page --
 		 * don't allow writes.
 		 */
 		prot &= ~VM_PROT_WRITE;
 	}
 
 	/*
 	 * Fail if an object should be created.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map)
 		return (KERN_FAILURE);
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_done:
  *
  *	Releases locks acquired by a vm_map_lookup
  *	(according to the handle returned by that lookup).
  */
 void
 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
 {
 	/*
 	 * Unlock the main-level map
 	 */
 	vm_map_unlock_read(map);
 }
 
 vm_offset_t
 vm_map_max_KBI(const struct vm_map *map)
 {
 
 	return (vm_map_max(map));
 }
 
 vm_offset_t
 vm_map_min_KBI(const struct vm_map *map)
 {
 
 	return (vm_map_min(map));
 }
 
 pmap_t
 vm_map_pmap_KBI(vm_map_t map)
 {
 
 	return (map->pmap);
 }
 
 bool
 vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 
 	return (vm_map_range_valid(map, start, end));
 }
 
 #ifdef INVARIANTS
 static void
 _vm_map_assert_consistent(vm_map_t map, int check)
 {
 	vm_map_entry_t entry, prev;
 	vm_map_entry_t cur, header, lbound, ubound;
 	vm_size_t max_left, max_right;
 
 #ifdef DIAGNOSTIC
 	++map->nupdates;
 #endif
 	if (enable_vmmap_check != check)
 		return;
 
 	header = prev = &map->header;
 	VM_MAP_ENTRY_FOREACH(entry, map) {
 		KASSERT(prev->end <= entry->start,
 		    ("map %p prev->end = %jx, start = %jx", map,
 		    (uintmax_t)prev->end, (uintmax_t)entry->start));
 		KASSERT(entry->start < entry->end,
 		    ("map %p start = %jx, end = %jx", map,
 		    (uintmax_t)entry->start, (uintmax_t)entry->end));
 		KASSERT(entry->left == header ||
 		    entry->left->start < entry->start,
 		    ("map %p left->start = %jx, start = %jx", map,
 		    (uintmax_t)entry->left->start, (uintmax_t)entry->start));
 		KASSERT(entry->right == header ||
 		    entry->start < entry->right->start,
 		    ("map %p start = %jx, right->start = %jx", map,
 		    (uintmax_t)entry->start, (uintmax_t)entry->right->start));
 		cur = map->root;
 		lbound = ubound = header;
 		for (;;) {
 			if (entry->start < cur->start) {
 				ubound = cur;
 				cur = cur->left;
 				KASSERT(cur != lbound,
 				    ("map %p cannot find %jx",
 				    map, (uintmax_t)entry->start));
 			} else if (cur->end <= entry->start) {
 				lbound = cur;
 				cur = cur->right;
 				KASSERT(cur != ubound,
 				    ("map %p cannot find %jx",
 				    map, (uintmax_t)entry->start));
 			} else {
 				KASSERT(cur == entry,
 				    ("map %p cannot find %jx",
 				    map, (uintmax_t)entry->start));
 				break;
 			}
 		}
 		max_left = vm_map_entry_max_free_left(entry, lbound);
 		max_right = vm_map_entry_max_free_right(entry, ubound);
 		KASSERT(entry->max_free == vm_size_max(max_left, max_right),
 		    ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
 		    (uintmax_t)entry->max_free,
 		    (uintmax_t)max_left, (uintmax_t)max_right));
 		prev = entry;
 	}
 	KASSERT(prev->end <= entry->start,
 	    ("map %p prev->end = %jx, start = %jx", map,
 	    (uintmax_t)prev->end, (uintmax_t)entry->start));
 }
 #endif
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 static void
 vm_map_print(vm_map_t map)
 {
 	vm_map_entry_t entry, prev;
 
 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
 	    (void *)map,
 	    (void *)map->pmap, map->nentries, map->timestamp);
 
 	db_indent += 2;
 	prev = &map->header;
 	VM_MAP_ENTRY_FOREACH(entry, map) {
 		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
 		    (void *)entry, (void *)entry->start, (void *)entry->end,
 		    entry->eflags);
 		{
 			static const char * const inheritance_name[4] =
 			{"share", "copy", "none", "donate_copy"};
 
 			db_iprintf(" prot=%x/%x/%s",
 			    entry->protection,
 			    entry->max_protection,
 			    inheritance_name[(int)(unsigned char)
 			    entry->inheritance]);
 			if (entry->wired_count != 0)
 				db_printf(", wired");
 		}
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			db_printf(", share=%p, offset=0x%jx\n",
 			    (void *)entry->object.sub_map,
 			    (uintmax_t)entry->offset);
 			if (prev == &map->header ||
 			    prev->object.sub_map !=
 				entry->object.sub_map) {
 				db_indent += 2;
 				vm_map_print((vm_map_t)entry->object.sub_map);
 				db_indent -= 2;
 			}
 		} else {
 			if (entry->cred != NULL)
 				db_printf(", ruid %d", entry->cred->cr_ruid);
 			db_printf(", object=%p, offset=0x%jx",
 			    (void *)entry->object.vm_object,
 			    (uintmax_t)entry->offset);
 			if (entry->object.vm_object && entry->object.vm_object->cred)
 				db_printf(", obj ruid %d charge %jx",
 				    entry->object.vm_object->cred->cr_ruid,
 				    (uintmax_t)entry->object.vm_object->charge);
 			if (entry->eflags & MAP_ENTRY_COW)
 				db_printf(", copy (%s)",
 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
 			db_printf("\n");
 
 			if (prev == &map->header ||
 			    prev->object.vm_object !=
 				entry->object.vm_object) {
 				db_indent += 2;
 				vm_object_print((db_expr_t)(intptr_t)
 						entry->object.vm_object,
 						0, 0, (char *)0);
 				db_indent -= 2;
 			}
 		}
 		prev = entry;
 	}
 	db_indent -= 2;
 }
 
 DB_SHOW_COMMAND(map, map)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show map <addr>\n");
 		return;
 	}
 	vm_map_print((vm_map_t)addr);
 }
 
 DB_SHOW_COMMAND(procvm, procvm)
 {
 	struct proc *p;
 
 	if (have_addr) {
 		p = db_lookup_proc(addr);
 	} else {
 		p = curproc;
 	}
 
 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
 	    (void *)vmspace_pmap(p->p_vmspace));
 
 	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
 }
 
 #endif /* DDB */
Index: head/sys/vm/vm_map.h
===================================================================
--- head/sys/vm/vm_map.h	(revision 367333)
+++ head/sys/vm/vm_map.h	(revision 367334)
@@ -1,528 +1,528 @@
 /*-
  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vm_map.h	8.9 (Berkeley) 5/17/95
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 /*
  *	Virtual memory map module definitions.
  */
 #ifndef	_VM_MAP_
 #define	_VM_MAP_
 
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/_mutex.h>
 
 /*
  *	Types defined:
  *
  *	vm_map_t		the high-level address map data structure.
  *	vm_map_entry_t		an entry in an address map.
  */
 
 typedef u_char vm_flags_t;
 typedef u_int vm_eflags_t;
 
 /*
  *	Objects which live in maps may be either VM objects, or
  *	another map (called a "sharing map") which denotes read-write
  *	sharing with other maps.
  */
 union vm_map_object {
 	struct vm_object *vm_object;	/* object object */
 	struct vm_map *sub_map;		/* belongs to another map */
 };
 
 /*
  *	Address map entries consist of start and end addresses,
  *	a VM object (or sharing map) and offset into that object,
  *	and user-exported inheritance and protection information.
  *	Also included is control information for virtual copy operations.
  */
 struct vm_map_entry {
 	struct vm_map_entry *left;	/* left child or previous entry */
 	struct vm_map_entry *right;	/* right child or next entry */
 	vm_offset_t start;		/* start address */
 	vm_offset_t end;		/* end address */
 	vm_offset_t next_read;		/* vaddr of the next sequential read */
 	vm_size_t max_free;		/* max free space in subtree */
 	union vm_map_object object;	/* object I point to */
 	vm_ooffset_t offset;		/* offset into object */
 	vm_eflags_t eflags;		/* map entry flags */
 	vm_prot_t protection;		/* protection code */
 	vm_prot_t max_protection;	/* maximum protection */
 	vm_inherit_t inheritance;	/* inheritance */
 	uint8_t read_ahead;		/* pages in the read-ahead window */
 	int wired_count;		/* can be paged if = 0 */
 	struct ucred *cred;		/* tmp storage for creator ref */
 	struct thread *wiring_thread;
 };
 
 #define	MAP_ENTRY_NOSYNC		0x00000001
 #define	MAP_ENTRY_IS_SUB_MAP		0x00000002
 #define	MAP_ENTRY_COW			0x00000004
 #define	MAP_ENTRY_NEEDS_COPY		0x00000008
 #define	MAP_ENTRY_NOFAULT		0x00000010
 #define	MAP_ENTRY_USER_WIRED		0x00000020
 
 #define	MAP_ENTRY_BEHAV_NORMAL		0x00000000	/* default behavior */
 #define	MAP_ENTRY_BEHAV_SEQUENTIAL	0x00000040	/* expect sequential
 							   access */
 #define	MAP_ENTRY_BEHAV_RANDOM		0x00000080	/* expect random
 							   access */
 #define	MAP_ENTRY_BEHAV_RESERVED	0x000000c0	/* future use */
 #define	MAP_ENTRY_BEHAV_MASK		0x000000c0
 #define	MAP_ENTRY_IN_TRANSITION		0x00000100	/* entry being
 							   changed */
 #define	MAP_ENTRY_NEEDS_WAKEUP		0x00000200	/* waiters in
 							   transition */
 #define	MAP_ENTRY_NOCOREDUMP		0x00000400	/* don't include in
 							   a core */
 #define	MAP_ENTRY_VN_EXEC		0x00000800	/* text vnode mapping */
 #define	MAP_ENTRY_GROWS_DOWN		0x00001000	/* top-down stacks */
 #define	MAP_ENTRY_GROWS_UP		0x00002000	/* bottom-up stacks */
 
 #define	MAP_ENTRY_WIRE_SKIPPED		0x00004000
 #define	MAP_ENTRY_WRITECNT		0x00008000	/* tracked writeable
 							   mapping */
 #define	MAP_ENTRY_GUARD			0x00010000
 #define	MAP_ENTRY_STACK_GAP_DN		0x00020000
 #define	MAP_ENTRY_STACK_GAP_UP		0x00040000
 #define	MAP_ENTRY_HEADER		0x00080000
 
 #define	MAP_ENTRY_SPLIT_BOUNDARY_MASK	0x00300000
 
 #define	MAP_ENTRY_SPLIT_BOUNDARY_SHIFT	20
 
 #ifdef	_KERNEL
 static __inline u_char
 vm_map_entry_behavior(vm_map_entry_t entry)
 {
 	return (entry->eflags & MAP_ENTRY_BEHAV_MASK);
 }
 
 static __inline int
 vm_map_entry_user_wired_count(vm_map_entry_t entry)
 {
 	if (entry->eflags & MAP_ENTRY_USER_WIRED)
 		return (1);
 	return (0);
 }
 
 static __inline int
 vm_map_entry_system_wired_count(vm_map_entry_t entry)
 {
 	return (entry->wired_count - vm_map_entry_user_wired_count(entry));
 }
 #endif	/* _KERNEL */
 
 /*
  *	A map is a set of map entries.  These map entries are
  *	organized as a threaded binary search tree.  Both structures
  *	are ordered based upon the start and end addresses contained
  *	within each map entry.  The largest gap between an entry in a
  *	subtree and one of its neighbors is saved in the max_free
  *	field, and that field is updated when the tree is
  *	restructured.
  *
  *	Sleator and Tarjan's top-down splay algorithm is employed to
  *	control height imbalance in the binary search tree.
  *
  *	The map's min offset value is stored in map->header.end, and
  *	its max offset value is stored in map->header.start.  These
  *	values act as sentinels for any forward or backward address
  *	scan of the list.  The right and left fields of the map
  *	header point to the first and list map entries.  The map
  *	header has a special value for the eflags field,
  *	MAP_ENTRY_HEADER, that is set initially, is never changed,
  *	and prevents an eflags match of the header with any other map
  *	entry.
  *
  *	List of locks
  *	(c)	const until freed
  */
 struct vm_map {
 	struct vm_map_entry header;	/* List of entries */
 	struct sx lock;			/* Lock for map data */
 	struct mtx system_mtx;
 	int nentries;			/* Number of entries */
 	vm_size_t size;			/* virtual size */
 	u_int timestamp;		/* Version number */
 	u_char needs_wakeup;
 	u_char system_map;		/* (c) Am I a system map? */
 	vm_flags_t flags;		/* flags for this vm_map */
 	vm_map_entry_t root;		/* Root of a binary search tree */
 	pmap_t pmap;			/* (c) Physical map */
 	vm_offset_t anon_loc;
 	int busy;
 #ifdef DIAGNOSTIC
 	int nupdates;
 #endif
 };
 
 /*
  * vm_flags_t values
  */
 #define MAP_WIREFUTURE		0x01	/* wire all future pages */
 #define	MAP_BUSY_WAKEUP		0x02
 #define	MAP_IS_SUB_MAP		0x04	/* has parent */
 #define	MAP_ASLR		0x08	/* enabled ASLR */
 #define	MAP_ASLR_IGNSTART	0x10
 
 #ifdef	_KERNEL
 #if defined(KLD_MODULE) && !defined(KLD_TIED)
 #define	vm_map_max(map)		vm_map_max_KBI((map))
 #define	vm_map_min(map)		vm_map_min_KBI((map))
 #define	vm_map_pmap(map)	vm_map_pmap_KBI((map))
 #define	vm_map_range_valid(map, start, end)	\
 	vm_map_range_valid_KBI((map), (start), (end))
 #else
 static __inline vm_offset_t
 vm_map_max(const struct vm_map *map)
 {
 
 	return (map->header.start);
 }
 
 static __inline vm_offset_t
 vm_map_min(const struct vm_map *map)
 {
 
 	return (map->header.end);
 }
 
 static __inline pmap_t
 vm_map_pmap(vm_map_t map)
 {
 	return (map->pmap);
 }
 
 static __inline void
 vm_map_modflags(vm_map_t map, vm_flags_t set, vm_flags_t clear)
 {
 	map->flags = (map->flags | set) & ~clear;
 }
 
 static inline bool
 vm_map_range_valid(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	if (end < start)
 		return (false);
 	if (start < vm_map_min(map) || end > vm_map_max(map))
 		return (false);
 	return (true);
 }
 
 #endif	/* KLD_MODULE */
 #endif	/* _KERNEL */
 
 /*
  * Shareable process virtual address space.
  *
  * List of locks
  *	(c)	const until freed
  */
 struct vmspace {
 	struct vm_map vm_map;	/* VM address map */
 	struct shmmap_state *vm_shm;	/* SYS5 shared memory private data XXX */
 	segsz_t vm_swrss;	/* resident set size before last swap */
 	segsz_t vm_tsize;	/* text size (pages) XXX */
 	segsz_t vm_dsize;	/* data size (pages) XXX */
 	segsz_t vm_ssize;	/* stack size (pages) */
 	caddr_t vm_taddr;	/* (c) user virtual address of text */
 	caddr_t vm_daddr;	/* (c) user virtual address of data */
 	caddr_t vm_maxsaddr;	/* user VA at max stack growth */
-	volatile int vm_refcnt;	/* number of references */
+	u_int vm_refcnt;	/* number of references */
 	/*
 	 * Keep the PMAP last, so that CPU-specific variations of that
 	 * structure on a single architecture don't result in offset
 	 * variations of the machine-independent fields in the vmspace.
 	 */
 	struct pmap vm_pmap;	/* private physical map */
 };
 
 #ifdef	_KERNEL
 static __inline pmap_t
 vmspace_pmap(struct vmspace *vmspace)
 {
 	return &vmspace->vm_pmap;
 }
 #endif	/* _KERNEL */
 
 #ifdef	_KERNEL
 /*
  *	Macros:		vm_map_lock, etc.
  *	Function:
  *		Perform locking on the data portion of a map.  Note that
  *		these macros mimic procedure calls returning void.  The
  *		semicolon is supplied by the user of these macros, not
  *		by the macros themselves.  The macros can safely be used
  *		as unbraced elements in a higher level statement.
  */
 
 void _vm_map_lock(vm_map_t map, const char *file, int line);
 void _vm_map_unlock(vm_map_t map, const char *file, int line);
 int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line);
 void _vm_map_lock_read(vm_map_t map, const char *file, int line);
 void _vm_map_unlock_read(vm_map_t map, const char *file, int line);
 int _vm_map_trylock(vm_map_t map, const char *file, int line);
 int _vm_map_trylock_read(vm_map_t map, const char *file, int line);
 int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line);
 void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line);
 int vm_map_locked(vm_map_t map);
 void vm_map_wakeup(vm_map_t map);
 void vm_map_busy(vm_map_t map);
 void vm_map_unbusy(vm_map_t map);
 void vm_map_wait_busy(vm_map_t map);
 vm_offset_t vm_map_max_KBI(const struct vm_map *map);
 vm_offset_t vm_map_min_KBI(const struct vm_map *map);
 pmap_t vm_map_pmap_KBI(vm_map_t map);
 bool vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end);
 
 #define	vm_map_lock(map)	_vm_map_lock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock(map)	_vm_map_unlock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock_and_wait(map, timo)	\
 			_vm_map_unlock_and_wait(map, timo, LOCK_FILE, LOCK_LINE)
 #define	vm_map_lock_read(map)	_vm_map_lock_read(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock_read(map)	_vm_map_unlock_read(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_trylock(map)	_vm_map_trylock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_trylock_read(map)	\
 			_vm_map_trylock_read(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_lock_upgrade(map)	\
 			_vm_map_lock_upgrade(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_lock_downgrade(map)	\
 			_vm_map_lock_downgrade(map, LOCK_FILE, LOCK_LINE)
 
 long vmspace_resident_count(struct vmspace *vmspace);
 #endif	/* _KERNEL */
 
 /*
  * Copy-on-write flags for vm_map operations
  */
 #define	MAP_INHERIT_SHARE	0x00000001
 #define	MAP_COPY_ON_WRITE	0x00000002
 #define	MAP_NOFAULT		0x00000004
 #define	MAP_PREFAULT		0x00000008
 #define	MAP_PREFAULT_PARTIAL	0x00000010
 #define	MAP_DISABLE_SYNCER	0x00000020
 #define	MAP_CHECK_EXCL		0x00000040
 #define	MAP_CREATE_GUARD	0x00000080
 #define	MAP_DISABLE_COREDUMP	0x00000100
 #define	MAP_PREFAULT_MADVISE	0x00000200    /* from (user) madvise request */
 #define	MAP_WRITECOUNT		0x00000400
 #define	MAP_REMAP		0x00000800
 #define	MAP_STACK_GROWS_DOWN	0x00001000
 #define	MAP_STACK_GROWS_UP	0x00002000
 #define	MAP_ACC_CHARGED		0x00004000
 #define	MAP_ACC_NO_CHARGE	0x00008000
 #define	MAP_CREATE_STACK_GAP_UP	0x00010000
 #define	MAP_CREATE_STACK_GAP_DN	0x00020000
 #define	MAP_VN_EXEC		0x00040000
 #define	MAP_SPLIT_BOUNDARY_MASK	0x00180000
 
 #define	MAP_SPLIT_BOUNDARY_SHIFT 19
 
 /*
  * vm_fault option flags
  */
 #define	VM_FAULT_NORMAL	0x00	/* Nothing special */
 #define	VM_FAULT_WIRE	0x01	/* Wire the mapped page */
 #define	VM_FAULT_DIRTY	0x02	/* Dirty the page; use w/VM_PROT_COPY */
 #define	VM_FAULT_NOFILL	0x04	/* Fail if the pager doesn't have a copy */
 
 /*
  * Initially, mappings are slightly sequential.  The maximum window size must
  * account for the map entry's "read_ahead" field being defined as an uint8_t.
  */
 #define	VM_FAULT_READ_AHEAD_MIN		7
 #define	VM_FAULT_READ_AHEAD_INIT	15
 #define	VM_FAULT_READ_AHEAD_MAX		min(atop(MAXPHYS) - 1, UINT8_MAX)
 
 /*
  * The following "find_space" options are supported by vm_map_find().
  *
  * For VMFS_ALIGNED_SPACE, the desired alignment is specified to
  * the macro argument as log base 2 of the desired alignment.
  */
 #define	VMFS_NO_SPACE		0	/* don't find; use the given range */
 #define	VMFS_ANY_SPACE		1	/* find a range with any alignment */
 #define	VMFS_OPTIMAL_SPACE	2	/* find a range with optimal alignment*/
 #define	VMFS_SUPER_SPACE	3	/* find a superpage-aligned range */
 #define	VMFS_ALIGNED_SPACE(x)	((x) << 8) /* find a range with fixed alignment */
 
 /*
  * vm_map_wire and vm_map_unwire option flags
  */
 #define VM_MAP_WIRE_SYSTEM	0	/* wiring in a kernel map */
 #define VM_MAP_WIRE_USER	1	/* wiring in a user map */
 
 #define VM_MAP_WIRE_NOHOLES	0	/* region must not have holes */
 #define VM_MAP_WIRE_HOLESOK	2	/* region may have holes */
 
 #define VM_MAP_WIRE_WRITE	4	/* Validate writable. */
 
 typedef int vm_map_entry_reader(void *token, vm_map_entry_t addr, 
     vm_map_entry_t dest);
 
 #ifndef _KERNEL
 /*
  * Find the successor of a map_entry, using a reader to dereference pointers.
  * '*clone' is a copy of a vm_map entry.  'reader' is used to copy a map entry
  * at some address into '*clone'.  Change *clone to a copy of the next map
  * entry, and return the address of that entry, or NULL if copying has failed.
  *
  * This function is made available to user-space code that needs to traverse
  * map entries.
  */
 static inline vm_map_entry_t
 vm_map_entry_read_succ(void *token, struct vm_map_entry *const clone,
     vm_map_entry_reader reader)
 {
 	vm_map_entry_t after, backup;
 	vm_offset_t start;
 
 	after = clone->right;
 	start = clone->start;
 	if (!reader(token, after, clone))
 		return (NULL);
 	backup = clone->left;
 	if (!reader(token, backup, clone))
 		return (NULL);
 	if (clone->start > start) {
 		do {
 			after = backup;
 			backup = clone->left;
 			if (!reader(token, backup, clone))
 				return (NULL);
 		} while (clone->start != start);
 	}
 	if (!reader(token, after, clone))
 		return (NULL);
 	return (after);
 }
 #endif				/* ! _KERNEL */
 
 #ifdef _KERNEL
 boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t);
 int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t);
 int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
     vm_offset_t, int, vm_prot_t, vm_prot_t, int);
 int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
     vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
 int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
     vm_offset_t max_addr, vm_offset_t alignment);
 int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
     vm_prot_t, vm_prot_t, int);
 vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t);
 int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t);
 void vm_map_init(vm_map_t, pmap_t, vm_offset_t, vm_offset_t);
 int vm_map_insert (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int);
 int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *,
     vm_pindex_t *, vm_prot_t *, boolean_t *);
 int vm_map_lookup_locked(vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *,
     vm_pindex_t *, vm_prot_t *, boolean_t *);
 void vm_map_lookup_done (vm_map_t, vm_map_entry_t);
 boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *);
 
 static inline vm_map_entry_t
 vm_map_entry_first(vm_map_t map)
 {
 
 	return (map->header.right);
 }
 
 static inline vm_map_entry_t
 vm_map_entry_succ(vm_map_entry_t entry)
 {
 	vm_map_entry_t after;
 
 	after = entry->right;
 	if (after->left->start > entry->start) {
 		do
 			after = after->left;
 		while (after->left != entry);
 	}
 	return (after);
 }
 
 #define VM_MAP_ENTRY_FOREACH(it, map)		\
 	for ((it) = vm_map_entry_first(map);	\
 	    (it) != &(map)->header;		\
 	    (it) = vm_map_entry_succ(it))
 int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t);
 int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t);
 void vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev,
     vm_map_entry_t entry);
 void vm_map_startup (void);
 int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t);
 int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
 int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
 int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags);
 int vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
 long vmspace_swap_count(struct vmspace *vmspace);
 void vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add);
 #endif				/* _KERNEL */
 #endif				/* _VM_MAP_ */