Index: head/sys/dev/cxgbe/tom/t4_ddp.c =================================================================== --- head/sys/dev/cxgbe/tom/t4_ddp.c (revision 367333) +++ head/sys/dev/cxgbe/tom/t4_ddp.c (revision 367334) @@ -1,1943 +1,1943 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" /* * Use the 'backend3' field in AIO jobs to store the amount of data * received by the AIO job so far. */ #define aio_received backend3 static void aio_ddp_requeue_task(void *context, int pending); static void ddp_complete_all(struct toepcb *toep, int error); static void t4_aio_cancel_active(struct kaiocb *job); static void t4_aio_cancel_queued(struct kaiocb *job); static TAILQ_HEAD(, pageset) ddp_orphan_pagesets; static struct mtx ddp_orphan_pagesets_lock; static struct task ddp_orphan_task; #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN) /* * A page set holds information about a buffer used for DDP. The page * set holds resources such as the VM pages backing the buffer (either * held or wired) and the page pods associated with the buffer. * Recently used page sets are cached to allow for efficient reuse of * buffers (avoiding the need to re-fault in pages, hold them, etc.). * Note that cached page sets keep the backing pages wired. The * number of wired pages is capped by only allowing for two wired * pagesets per connection. This is not a perfect cap, but is a * trade-off for performance. * * If an application ping-pongs two buffers for a connection via * aio_read(2) then those buffers should remain wired and expensive VM * fault lookups should be avoided after each buffer has been used * once. If an application uses more than two buffers then this will * fall back to doing expensive VM fault lookups for each operation. */ static void free_pageset(struct tom_data *td, struct pageset *ps) { vm_page_t p; int i; if (ps->prsv.prsv_nppods > 0) t4_free_page_pods(&ps->prsv); for (i = 0; i < ps->npages; i++) { p = ps->pages[i]; vm_page_unwire(p, PQ_INACTIVE); } mtx_lock(&ddp_orphan_pagesets_lock); TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link); taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task); mtx_unlock(&ddp_orphan_pagesets_lock); } static void ddp_free_orphan_pagesets(void *context, int pending) { struct pageset *ps; mtx_lock(&ddp_orphan_pagesets_lock); while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) { ps = TAILQ_FIRST(&ddp_orphan_pagesets); TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link); mtx_unlock(&ddp_orphan_pagesets_lock); if (ps->vm) vmspace_free(ps->vm); free(ps, M_CXGBE); mtx_lock(&ddp_orphan_pagesets_lock); } mtx_unlock(&ddp_orphan_pagesets_lock); } static void recycle_pageset(struct toepcb *toep, struct pageset *ps) { DDP_ASSERT_LOCKED(toep); if (!(toep->ddp.flags & DDP_DEAD)) { KASSERT(toep->ddp.cached_count + toep->ddp.active_count < nitems(toep->ddp.db), ("too many wired pagesets")); TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link); toep->ddp.cached_count++; } else free_pageset(toep->td, ps); } static void ddp_complete_one(struct kaiocb *job, int error) { long copied; /* * If this job had copied data out of the socket buffer before * it was cancelled, report it as a short read rather than an * error. */ copied = job->aio_received; if (copied != 0 || error == 0) aio_complete(job, copied, 0); else aio_complete(job, -1, error); } static void free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db) { if (db->job) { /* * XXX: If we are un-offloading the socket then we * should requeue these on the socket somehow. If we * got a FIN from the remote end, then this completes * any remaining requests with an EOF read. */ if (!aio_clear_cancel_function(db->job)) ddp_complete_one(db->job, 0); } if (db->ps) free_pageset(td, db->ps); } void ddp_init_toep(struct toepcb *toep) { TAILQ_INIT(&toep->ddp.aiojobq); TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep); toep->ddp.flags = DDP_OK; toep->ddp.active_id = -1; mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF); } void ddp_uninit_toep(struct toepcb *toep) { mtx_destroy(&toep->ddp.lock); } void release_ddp_resources(struct toepcb *toep) { struct pageset *ps; int i; DDP_LOCK(toep); toep->ddp.flags |= DDP_DEAD; for (i = 0; i < nitems(toep->ddp.db); i++) { free_ddp_buffer(toep->td, &toep->ddp.db[i]); } while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) { TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); free_pageset(toep->td, ps); } ddp_complete_all(toep, 0); DDP_UNLOCK(toep); } #ifdef INVARIANTS void ddp_assert_empty(struct toepcb *toep) { int i; MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE)); for (i = 0; i < nitems(toep->ddp.db); i++) { MPASS(toep->ddp.db[i].job == NULL); MPASS(toep->ddp.db[i].ps == NULL); } MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets)); MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq)); } #endif static void complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db, unsigned int db_idx) { unsigned int db_flag; toep->ddp.active_count--; if (toep->ddp.active_id == db_idx) { if (toep->ddp.active_count == 0) { KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL, ("%s: active_count mismatch", __func__)); toep->ddp.active_id = -1; } else toep->ddp.active_id ^= 1; #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: tid %u, ddp_active_id = %d", __func__, toep->tid, toep->ddp.active_id); #endif } else { KASSERT(toep->ddp.active_count != 0 && toep->ddp.active_id != -1, ("%s: active count mismatch", __func__)); } db->cancel_pending = 0; db->job = NULL; recycle_pageset(toep, db->ps); db->ps = NULL; db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; KASSERT(toep->ddp.flags & db_flag, ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x", __func__, toep, toep->ddp.flags)); toep->ddp.flags &= ~db_flag; } /* XXX: handle_ddp_data code duplication */ void insert_ddp_data(struct toepcb *toep, uint32_t n) { struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct ddp_buffer *db; struct kaiocb *job; size_t placed; long copied; unsigned int db_flag, db_idx; INP_WLOCK_ASSERT(inp); DDP_ASSERT_LOCKED(toep); tp->rcv_nxt += n; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__)); tp->rcv_wnd -= n; #endif CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP", __func__, n); while (toep->ddp.active_count > 0) { MPASS(toep->ddp.active_id != -1); db_idx = toep->ddp.active_id; db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; MPASS((toep->ddp.flags & db_flag) != 0); db = &toep->ddp.db[db_idx]; job = db->job; copied = job->aio_received; placed = n; if (placed > job->uaiocb.aio_nbytes - copied) placed = job->uaiocb.aio_nbytes - copied; if (placed > 0) job->msgrcv = 1; if (!aio_clear_cancel_function(job)) { /* * Update the copied length for when * t4_aio_cancel_active() completes this * request. */ job->aio_received += placed; } else if (copied + placed != 0) { CTR4(KTR_CXGBE, "%s: completing %p (copied %ld, placed %lu)", __func__, job, copied, placed); /* XXX: This always completes if there is some data. */ aio_complete(job, copied + placed, 0); } else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) { TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count++; } else aio_cancel(job); n -= placed; complete_ddp_buffer(toep, db, db_idx); } MPASS(n == 0); } /* SET_TCB_FIELD sent as a ULP command looks like this */ #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) /* RX_DATA_ACK sent as a ULP command looks like this */ #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core)) static inline void * mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep, uint64_t word, uint64_t mask, uint64_t val) { struct ulptx_idata *ulpsc; struct cpl_set_tcb_field_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid)); req->reply_ctrl = htobe16(V_NO_REPLY(1) | V_QUEUENO(toep->ofld_rxq->iq.abs_id)); req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); req->mask = htobe64(mask); req->val = htobe64(val); ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__SET_TCB_FIELD_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static inline void * mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep) { struct ulptx_idata *ulpsc; struct cpl_rx_data_ack_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_rx_data_ack_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid)); req->credit_dack = htobe32(F_RX_MODULATE_RX); ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__RX_DATA_ACK_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static struct wrqe * mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx, struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask) { struct wrqe *wr; struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int len; KASSERT(db_idx == 0 || db_idx == 1, ("%s: bad DDP buffer index %d", __func__, db_idx)); /* * We'll send a compound work request that has 3 SET_TCB_FIELDs and an * RX_DATA_ACK (with RX_MODULATE to speed up delivery). * * The work request header is 16B and always ends at a 16B boundary. * The ULPTX master commands that follow must all end at 16B boundaries * too so we round up the size to 16. */ len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) + roundup2(LEN__RX_DATA_ACK_ULP, 16); wr = alloc_wrqe(len, toep->ctrlq); if (wr == NULL) return (NULL); wrh = wrtod(wr); INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); /* Write the buffer's tag */ ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF0_TAG + db_idx, V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag)); /* Update the current offset in the DDP buffer and its total length */ if (db_idx == 0) ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF0_OFFSET, V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_OFFSET(offset) | V_TCB_RX_DDP_BUF0_LEN(ps->len)); else ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF1_OFFSET, V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32), V_TCB_RX_DDP_BUF1_OFFSET(offset) | V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32)); /* Update DDP flags */ ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS, ddp_flags_mask, ddp_flags); /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */ ulpmc = mk_rx_data_ack_ulp(ulpmc, toep); return (wr); } static int handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) { uint32_t report = be32toh(ddp_report); unsigned int db_idx; struct inpcb *inp = toep->inp; struct ddp_buffer *db; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct kaiocb *job; long copied; db_idx = report & F_DDP_BUF_IDX ? 1 : 0; if (__predict_false(!(report & F_DDP_INV))) CXGBE_UNIMPLEMENTED("DDP buffer still valid"); INP_WLOCK(inp); so = inp_inpcbtosocket(inp); sb = &so->so_rcv; DDP_LOCK(toep); KASSERT(toep->ddp.active_id == db_idx, ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx, toep->ddp.active_id, toep->tid)); db = &toep->ddp.db[db_idx]; job = db->job; if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { /* * This can happen due to an administrative tcpdrop(8). * Just fail the request with ECONNRESET. */ CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); if (aio_clear_cancel_function(job)) ddp_complete_one(job, ECONNRESET); goto completed; } tp = intotcpcb(inp); /* * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the * sequence number of the next byte to receive. The length of * the data received for this message must be computed by * comparing the new and old values of rcv_nxt. * * For RX_DATA_DDP, len might be non-zero, but it is only the * length of the most recent DMA. It does not include the * total length of the data received since the previous update * for this DDP buffer. rcv_nxt is the sequence number of the * first received byte from the most recent DMA. */ len += be32toh(rcv_nxt) - tp->rcv_nxt; tp->rcv_nxt += len; tp->t_rcvtime = ticks; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); tp->rcv_wnd -= len; #endif #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__, toep->tid, db_idx, len, report); #endif /* receive buffer autosize */ MPASS(toep->vnet == so->so_vnet); CURVNET_SET(toep->vnet); SOCKBUF_LOCK(sb); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && len > (sbspace(sb) / 8 * 7)) { struct adapter *sc = td_adapter(toep->td); unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, V_tcp_autorcvbuf_max); if (!sbreserve_locked(sb, newsize, so, NULL)) sb->sb_flags &= ~SB_AUTOSIZE; } SOCKBUF_UNLOCK(sb); CURVNET_RESTORE(); job->msgrcv = 1; if (db->cancel_pending) { /* * Update the job's length but defer completion to the * TCB_RPL callback. */ job->aio_received += len; goto out; } else if (!aio_clear_cancel_function(job)) { /* * Update the copied length for when * t4_aio_cancel_active() completes this request. */ job->aio_received += len; } else { copied = job->aio_received; #ifdef VERBOSE_TRACES CTR5(KTR_CXGBE, "%s: tid %u, completing %p (copied %ld, placed %d)", __func__, toep->tid, job, copied, len); #endif aio_complete(job, copied + len, 0); t4_rcvd(&toep->td->tod, tp); } completed: complete_ddp_buffer(toep, db, db_idx); if (toep->ddp.waiting_count > 0) ddp_queue_toep(toep); out: DDP_UNLOCK(toep); INP_WUNLOCK(inp); return (0); } void handle_ddp_indicate(struct toepcb *toep) { DDP_ASSERT_LOCKED(toep); MPASS(toep->ddp.active_count == 0); MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0); if (toep->ddp.waiting_count == 0) { /* * The pending requests that triggered the request for an * an indicate were cancelled. Those cancels should have * already disabled DDP. Just ignore this as the data is * going into the socket buffer anyway. */ return; } CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__, toep->tid, toep->ddp.waiting_count); ddp_queue_toep(toep); } enum { DDP_BUF0_INVALIDATED = 0x2, DDP_BUF1_INVALIDATED }; CTASSERT(DDP_BUF0_INVALIDATED == CPL_COOKIE_DDP0); static int do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); unsigned int db_idx; struct toepcb *toep; struct inpcb *inp; struct ddp_buffer *db; struct kaiocb *job; long copied; if (cpl->status != CPL_ERR_NONE) panic("XXX: tcp_rpl failed: %d", cpl->status); toep = lookup_tid(sc, tid); inp = toep->inp; switch (cpl->cookie) { case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF0_INVALIDATED): case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF1_INVALIDATED): /* * XXX: This duplicates a lot of code with handle_ddp_data(). */ db_idx = G_COOKIE(cpl->cookie) - DDP_BUF0_INVALIDATED; MPASS(db_idx < nitems(toep->ddp.db)); INP_WLOCK(inp); DDP_LOCK(toep); db = &toep->ddp.db[db_idx]; /* * handle_ddp_data() should leave the job around until * this callback runs once a cancel is pending. */ MPASS(db != NULL); MPASS(db->job != NULL); MPASS(db->cancel_pending); /* * XXX: It's not clear what happens if there is data * placed when the buffer is invalidated. I suspect we * need to read the TCB to see how much data was placed. * * For now this just pretends like nothing was placed. * * XXX: Note that if we did check the PCB we would need to * also take care of updating the tp, etc. */ job = db->job; copied = job->aio_received; if (copied == 0) { CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job); aio_cancel(job); } else { CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)", __func__, job, copied); aio_complete(job, copied, 0); t4_rcvd(&toep->td->tod, intotcpcb(inp)); } complete_ddp_buffer(toep, db, db_idx); if (toep->ddp.waiting_count > 0) ddp_queue_toep(toep); DDP_UNLOCK(toep); INP_WUNLOCK(inp); break; default: panic("XXX: unknown tcb_rpl offset %#x, cookie %#x", G_WORD(cpl->cookie), G_COOKIE(cpl->cookie)); } return (0); } void handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt) { struct ddp_buffer *db; struct kaiocb *job; long copied; unsigned int db_flag, db_idx; int len, placed; INP_WLOCK_ASSERT(toep->inp); DDP_ASSERT_LOCKED(toep); len = be32toh(rcv_nxt) - tp->rcv_nxt; tp->rcv_nxt += len; while (toep->ddp.active_count > 0) { MPASS(toep->ddp.active_id != -1); db_idx = toep->ddp.active_id; db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; MPASS((toep->ddp.flags & db_flag) != 0); db = &toep->ddp.db[db_idx]; job = db->job; copied = job->aio_received; placed = len; if (placed > job->uaiocb.aio_nbytes - copied) placed = job->uaiocb.aio_nbytes - copied; if (placed > 0) job->msgrcv = 1; if (!aio_clear_cancel_function(job)) { /* * Update the copied length for when * t4_aio_cancel_active() completes this * request. */ job->aio_received += placed; } else { CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d", __func__, toep->tid, db_idx, placed); aio_complete(job, copied + placed, 0); } len -= placed; complete_ddp_buffer(toep, db, db_idx); } MPASS(len == 0); ddp_complete_all(toep, 0); } #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR) extern cpl_handler_t t4_cpl_handler[]; static int do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); uint32_t vld; struct toepcb *toep = lookup_tid(sc, tid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); vld = be32toh(cpl->ddpvld); if (__predict_false(vld & DDP_ERR)) { panic("%s: DDP error 0x%x (tid %d, toep %p)", __func__, vld, tid, toep); } if (ulp_mode(toep) == ULP_MODE_ISCSI) { t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m); return (0); } handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len)); return (0); } static int do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0); return (0); } static void enable_ddp(struct adapter *sc, struct toepcb *toep) { KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK, ("%s: toep %p has bad ddp_flags 0x%x", __func__, toep, toep->ddp.flags)); CTR3(KTR_CXGBE, "%s: tid %u (time %u)", __func__, toep->tid, time_uptime); DDP_ASSERT_LOCKED(toep); toep->ddp.flags |= DDP_SC_REQ; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) | V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) | V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0); } static int calculate_hcf(int n1, int n2) { int a, b, t; if (n1 <= n2) { a = n1; b = n2; } else { a = n2; b = n1; } while (a != 0) { t = a; a = b % a; b = t; } return (b); } static inline int pages_to_nppods(int npages, int ddp_page_shift) { MPASS(ddp_page_shift >= PAGE_SHIFT); return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES)); } static int alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx, struct ppod_reservation *prsv) { vmem_addr_t addr; /* relative to start of region */ if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT, &addr) != 0) return (ENOMEM); CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d", __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask, nppods, 1 << pr->pr_page_shift[pgsz_idx]); /* * The hardware tagmask includes an extra invalid bit but the arena was * seeded with valid values only. An allocation out of this arena will * fit inside the tagmask but won't have the invalid bit set. */ MPASS((addr & pr->pr_tag_mask) == addr); MPASS((addr & pr->pr_invalid_bit) == 0); prsv->prsv_pr = pr; prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr; prsv->prsv_nppods = nppods; return (0); } int t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps) { int i, hcf, seglen, idx, nppods; struct ppod_reservation *prsv = &ps->prsv; KASSERT(prsv->prsv_nppods == 0, ("%s: page pods already allocated", __func__)); /* * The DDP page size is unrelated to the VM page size. We combine * contiguous physical pages into larger segments to get the best DDP * page size possible. This is the largest of the four sizes in * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in * the page list. */ hcf = 0; for (i = 0; i < ps->npages; i++) { seglen = PAGE_SIZE; while (i < ps->npages - 1 && ps->pages[i]->phys_addr + PAGE_SIZE == ps->pages[i + 1]->phys_addr) { seglen += PAGE_SIZE; i++; } hcf = calculate_hcf(hcf, seglen); if (hcf < (1 << pr->pr_page_shift[1])) { idx = 0; goto have_pgsz; /* give up, short circuit */ } } #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1) MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */ for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) { if ((hcf & PR_PAGE_MASK(idx)) == 0) break; } #undef PR_PAGE_MASK have_pgsz: MPASS(idx <= M_PPOD_PGSZ); nppods = pages_to_nppods(ps->npages, pr->pr_page_shift[idx]); if (alloc_page_pods(pr, nppods, idx, prsv) != 0) return (0); MPASS(prsv->prsv_nppods > 0); return (1); } int t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len, struct ppod_reservation *prsv) { int hcf, seglen, idx, npages, nppods; uintptr_t start_pva, end_pva, pva, p1; MPASS(buf > 0); MPASS(len > 0); /* * The DDP page size is unrelated to the VM page size. We combine * contiguous physical pages into larger segments to get the best DDP * page size possible. This is the largest of the four sizes in * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes * in the page list. */ hcf = 0; start_pva = trunc_page(buf); end_pva = trunc_page(buf + len - 1); pva = start_pva; while (pva <= end_pva) { seglen = PAGE_SIZE; p1 = pmap_kextract(pva); pva += PAGE_SIZE; while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) { seglen += PAGE_SIZE; pva += PAGE_SIZE; } hcf = calculate_hcf(hcf, seglen); if (hcf < (1 << pr->pr_page_shift[1])) { idx = 0; goto have_pgsz; /* give up, short circuit */ } } #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1) MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */ for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) { if ((hcf & PR_PAGE_MASK(idx)) == 0) break; } #undef PR_PAGE_MASK have_pgsz: MPASS(idx <= M_PPOD_PGSZ); npages = 1; npages += (end_pva - start_pva) >> pr->pr_page_shift[idx]; nppods = howmany(npages, PPOD_PAGES); if (alloc_page_pods(pr, nppods, idx, prsv) != 0) return (ENOMEM); MPASS(prsv->prsv_nppods > 0); return (0); } void t4_free_page_pods(struct ppod_reservation *prsv) { struct ppod_region *pr = prsv->prsv_pr; vmem_addr_t addr; MPASS(prsv != NULL); MPASS(prsv->prsv_nppods != 0); addr = prsv->prsv_tag & pr->pr_tag_mask; MPASS((addr & pr->pr_invalid_bit) == 0); CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__, pr->pr_arena, addr, prsv->prsv_nppods); vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods)); prsv->prsv_nppods = 0; } #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE) int t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid, struct pageset *ps) { struct wrqe *wr; struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz, idx; u_int ppod_addr; uint32_t cmd; struct ppod_reservation *prsv = &ps->prsv; struct ppod_region *pr = prsv->prsv_pr; KASSERT(!(ps->flags & PS_PPODS_WRITTEN), ("%s: page pods already written", __func__)); MPASS(prsv->prsv_nppods > 0); cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); wr = alloc_wrqe(len, wrq); if (wr == NULL) return (ENOMEM); /* ok to just bail out */ ulpmc = wrtod(wr); INIT_ULPTX_WR(ulpmc, len, 0, 0); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(tid) | prsv->prsv_tag); ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) | V_PPOD_OFST(ps->offset)); ppod->rsvd = 0; idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE); for (k = 0; k < nitems(ppod->addr); k++) { if (idx < ps->npages) { ppod->addr[k] = htobe64(ps->pages[idx]->phys_addr); idx += ddp_pgsz / PAGE_SIZE; } else ppod->addr[k] = 0; #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, toep->tid, i, k, htobe64(ppod->addr[k])); #endif } } t4_wrq_tx(sc, wr); } ps->flags |= PS_PPODS_WRITTEN; return (0); } int t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid, struct ppod_reservation *prsv, vm_offset_t buf, int buflen) { struct wrqe *wr; struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz; u_int ppod_addr, offset; uint32_t cmd; struct ppod_region *pr = prsv->prsv_pr; uintptr_t end_pva, pva, pa; cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; offset = buf & PAGE_MASK; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); pva = trunc_page(buf); end_pva = trunc_page(buf + buflen - 1); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); MPASS(n > 0); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); wr = alloc_wrqe(len, wrq); if (wr == NULL) return (ENOMEM); /* ok to just bail out */ ulpmc = wrtod(wr); INIT_ULPTX_WR(ulpmc, len, 0, 0); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(tid) | (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ))); ppod->len_offset = htobe64(V_PPOD_LEN(buflen) | V_PPOD_OFST(offset)); ppod->rsvd = 0; for (k = 0; k < nitems(ppod->addr); k++) { if (pva > end_pva) ppod->addr[k] = 0; else { pa = pmap_kextract(pva); ppod->addr[k] = htobe64(pa); pva += ddp_pgsz; } #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, tid, i, k, htobe64(ppod->addr[k])); #endif } /* * Walk back 1 segment so that the first address in the * next pod is the same as the last one in the current * pod. */ pva -= ddp_pgsz; } t4_wrq_tx(sc, wr); } MPASS(pva <= end_pva); return (0); } /* * Prepare a pageset for DDP. This sets up page pods. */ static int prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps) { struct tom_data *td = sc->tom_softc; if (ps->prsv.prsv_nppods == 0 && !t4_alloc_page_pods_for_ps(&td->pr, ps)) { return (0); } if (!(ps->flags & PS_PPODS_WRITTEN) && t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) { return (0); } return (1); } int t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz, const char *name) { int i; MPASS(pr != NULL); MPASS(r->size > 0); pr->pr_start = r->start; pr->pr_len = r->size; pr->pr_page_shift[0] = 12 + G_HPZ0(psz); pr->pr_page_shift[1] = 12 + G_HPZ1(psz); pr->pr_page_shift[2] = 12 + G_HPZ2(psz); pr->pr_page_shift[3] = 12 + G_HPZ3(psz); /* The SGL -> page pod algorithm requires the sizes to be in order. */ for (i = 1; i < nitems(pr->pr_page_shift); i++) { if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1]) return (ENXIO); } pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG); pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask; if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0) return (ENXIO); pr->pr_alias_shift = fls(pr->pr_tag_mask); pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1); pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0, M_FIRSTFIT | M_NOWAIT); if (pr->pr_arena == NULL) return (ENOMEM); return (0); } void t4_free_ppod_region(struct ppod_region *pr) { MPASS(pr != NULL); if (pr->pr_arena) vmem_destroy(pr->pr_arena); bzero(pr, sizeof(*pr)); } static int pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages, int pgoff, int len) { if (ps->start != start || ps->npages != npages || ps->offset != pgoff || ps->len != len) return (1); return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp); } static int hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps) { struct vmspace *vm; vm_map_t map; vm_offset_t start, end, pgoff; struct pageset *ps; int n; DDP_ASSERT_LOCKED(toep); /* * The AIO subsystem will cancel and drain all requests before * permitting a process to exit or exec, so p_vmspace should * be stable here. */ vm = job->userproc->p_vmspace; map = &vm->vm_map; start = (uintptr_t)job->uaiocb.aio_buf; pgoff = start & PAGE_MASK; end = round_page(start + job->uaiocb.aio_nbytes); start = trunc_page(start); if (end - start > MAX_DDP_BUFFER_SIZE) { /* * Truncate the request to a short read. * Alternatively, we could DDP in chunks to the larger * buffer, but that would be quite a bit more work. * * When truncating, round the request down to avoid * crossing a cache line on the final transaction. */ end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE); #ifdef VERBOSE_TRACES CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu", __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes, (unsigned long)(end - (start + pgoff))); job->uaiocb.aio_nbytes = end - (start + pgoff); #endif end = round_page(end); } n = atop(end - start); /* * Try to reuse a cached pageset. */ TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) { if (pscmp(ps, vm, start, n, pgoff, job->uaiocb.aio_nbytes) == 0) { TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); toep->ddp.cached_count--; *pps = ps; return (0); } } /* * If there are too many cached pagesets to create a new one, * free a pageset before creating a new one. */ KASSERT(toep->ddp.active_count + toep->ddp.cached_count <= nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__)); if (toep->ddp.active_count + toep->ddp.cached_count == nitems(toep->ddp.db)) { KASSERT(toep->ddp.cached_count > 0, ("no cached pageset to free")); ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq); TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); toep->ddp.cached_count--; free_pageset(toep->td, ps); } DDP_UNLOCK(toep); /* Create a new pageset. */ ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK | M_ZERO); ps->pages = (vm_page_t *)(ps + 1); ps->vm_timestamp = map->timestamp; ps->npages = vm_fault_quick_hold_pages(map, start, end - start, VM_PROT_WRITE, ps->pages, n); DDP_LOCK(toep); if (ps->npages < 0) { free(ps, M_CXGBE); return (EFAULT); } KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d", ps->npages, n)); ps->offset = pgoff; ps->len = job->uaiocb.aio_nbytes; - atomic_add_int(&vm->vm_refcnt, 1); + refcount_acquire(&vm->vm_refcnt); ps->vm = vm; ps->start = start; CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d", __func__, toep->tid, ps, job, ps->npages); *pps = ps; return (0); } static void ddp_complete_all(struct toepcb *toep, int error) { struct kaiocb *job; DDP_ASSERT_LOCKED(toep); while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) { job = TAILQ_FIRST(&toep->ddp.aiojobq); TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count--; if (aio_clear_cancel_function(job)) ddp_complete_one(job, error); } } static void aio_ddp_cancel_one(struct kaiocb *job) { long copied; /* * If this job had copied data out of the socket buffer before * it was cancelled, report it as a short read rather than an * error. */ copied = job->aio_received; if (copied != 0) aio_complete(job, copied, 0); else aio_cancel(job); } /* * Called when the main loop wants to requeue a job to retry it later. * Deals with the race of the job being cancelled while it was being * examined. */ static void aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job) { DDP_ASSERT_LOCKED(toep); if (!(toep->ddp.flags & DDP_DEAD) && aio_set_cancel_function(job, t4_aio_cancel_queued)) { TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count++; } else aio_ddp_cancel_one(job); } static void aio_ddp_requeue(struct toepcb *toep) { struct adapter *sc = td_adapter(toep->td); struct socket *so; struct sockbuf *sb; struct inpcb *inp; struct kaiocb *job; struct ddp_buffer *db; size_t copied, offset, resid; struct pageset *ps; struct mbuf *m; uint64_t ddp_flags, ddp_flags_mask; struct wrqe *wr; int buf_flag, db_idx, error; DDP_ASSERT_LOCKED(toep); restart: if (toep->ddp.flags & DDP_DEAD) { MPASS(toep->ddp.waiting_count == 0); MPASS(toep->ddp.active_count == 0); return; } if (toep->ddp.waiting_count == 0 || toep->ddp.active_count == nitems(toep->ddp.db)) { return; } job = TAILQ_FIRST(&toep->ddp.aiojobq); so = job->fd_file->f_data; sb = &so->so_rcv; SOCKBUF_LOCK(sb); /* We will never get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { SOCKBUF_UNLOCK(sb); ddp_complete_all(toep, ENOTCONN); return; } KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0, ("%s: pending sockbuf data and DDP is active", __func__)); /* Abort if socket has reported problems. */ /* XXX: Wait for any queued DDP's to finish and/or flush them? */ if (so->so_error && sbavail(sb) == 0) { toep->ddp.waiting_count--; TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); if (!aio_clear_cancel_function(job)) { SOCKBUF_UNLOCK(sb); goto restart; } /* * If this job has previously copied some data, report * a short read and leave the error to be reported by * a future request. */ copied = job->aio_received; if (copied != 0) { SOCKBUF_UNLOCK(sb); aio_complete(job, copied, 0); goto restart; } error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); aio_complete(job, -1, error); goto restart; } /* * Door is closed. If there is pending data in the socket buffer, * deliver it. If there are pending DDP requests, wait for those * to complete. Once they have completed, return EOF reads. */ if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) { SOCKBUF_UNLOCK(sb); if (toep->ddp.active_count != 0) return; ddp_complete_all(toep, 0); return; } /* * If DDP is not enabled and there is no pending socket buffer * data, try to enable DDP. */ if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) { SOCKBUF_UNLOCK(sb); /* * Wait for the card to ACK that DDP is enabled before * queueing any buffers. Currently this waits for an * indicate to arrive. This could use a TCB_SET_FIELD_RPL * message to know that DDP was enabled instead of waiting * for the indicate which would avoid copying the indicate * if no data is pending. * * XXX: Might want to limit the indicate size to the size * of the first queued request. */ if ((toep->ddp.flags & DDP_SC_REQ) == 0) enable_ddp(sc, toep); return; } SOCKBUF_UNLOCK(sb); /* * If another thread is queueing a buffer for DDP, let it * drain any work and return. */ if (toep->ddp.queueing != NULL) return; /* Take the next job to prep it for DDP. */ toep->ddp.waiting_count--; TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); if (!aio_clear_cancel_function(job)) goto restart; toep->ddp.queueing = job; /* NB: This drops DDP_LOCK while it holds the backing VM pages. */ error = hold_aio(toep, job, &ps); if (error != 0) { ddp_complete_one(job, error); toep->ddp.queueing = NULL; goto restart; } SOCKBUF_LOCK(sb); if (so->so_error && sbavail(sb) == 0) { copied = job->aio_received; if (copied != 0) { SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_complete(job, copied, 0); toep->ddp.queueing = NULL; goto restart; } error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_complete(job, -1, error); toep->ddp.queueing = NULL; goto restart; } if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) { SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); if (toep->ddp.active_count != 0) { /* * The door is closed, but there are still pending * DDP buffers. Requeue. These jobs will all be * completed once those buffers drain. */ aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; return; } ddp_complete_one(job, 0); ddp_complete_all(toep, 0); toep->ddp.queueing = NULL; return; } sbcopy: /* * If the toep is dead, there shouldn't be any data in the socket * buffer, so the above case should have handled this. */ MPASS(!(toep->ddp.flags & DDP_DEAD)); /* * If there is pending data in the socket buffer (either * from before the requests were queued or a DDP indicate), * copy those mbufs out directly. */ copied = 0; offset = ps->offset + job->aio_received; MPASS(job->aio_received <= job->uaiocb.aio_nbytes); resid = job->uaiocb.aio_nbytes - job->aio_received; m = sb->sb_mb; KASSERT(m == NULL || toep->ddp.active_count == 0, ("%s: sockbuf data with active DDP", __func__)); while (m != NULL && resid > 0) { struct iovec iov[1]; struct uio uio; int error; iov[0].iov_base = mtod(m, void *); iov[0].iov_len = m->m_len; if (iov[0].iov_len > resid) iov[0].iov_len = resid; uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = iov[0].iov_len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; error = uiomove_fromphys(ps->pages, offset + copied, uio.uio_resid, &uio); MPASS(error == 0 && uio.uio_resid == 0); copied += uio.uio_offset; resid -= uio.uio_offset; m = m->m_next; } if (copied != 0) { sbdrop_locked(sb, copied); job->aio_received += copied; job->msgrcv = 1; copied = job->aio_received; inp = sotoinpcb(so); if (!INP_TRY_WLOCK(inp)) { /* * The reference on the socket file descriptor in * the AIO job should keep 'sb' and 'inp' stable. * Our caller has a reference on the 'toep' that * keeps it stable. */ SOCKBUF_UNLOCK(sb); DDP_UNLOCK(toep); INP_WLOCK(inp); DDP_LOCK(toep); SOCKBUF_LOCK(sb); /* * If the socket has been closed, we should detect * that and complete this request if needed on * the next trip around the loop. */ } t4_rcvd_locked(&toep->td->tod, intotcpcb(inp)); INP_WUNLOCK(inp); if (resid == 0 || toep->ddp.flags & DDP_DEAD) { /* * We filled the entire buffer with socket * data, DDP is not being used, or the socket * is being shut down, so complete the * request. */ SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_complete(job, copied, 0); toep->ddp.queueing = NULL; goto restart; } /* * If DDP is not enabled, requeue this request and restart. * This will either enable DDP or wait for more data to * arrive on the socket buffer. */ if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) { SOCKBUF_UNLOCK(sb); recycle_pageset(toep, ps); aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; goto restart; } /* * An indicate might have arrived and been added to * the socket buffer while it was unlocked after the * copy to lock the INP. If so, restart the copy. */ if (sbavail(sb) != 0) goto sbcopy; } SOCKBUF_UNLOCK(sb); if (prep_pageset(sc, toep, ps) == 0) { recycle_pageset(toep, ps); aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; /* * XXX: Need to retry this later. Mostly need a trigger * when page pods are freed up. */ printf("%s: prep_pageset failed\n", __func__); return; } /* Determine which DDP buffer to use. */ if (toep->ddp.db[0].job == NULL) { db_idx = 0; } else { MPASS(toep->ddp.db[1].job == NULL); db_idx = 1; } ddp_flags = 0; ddp_flags_mask = 0; if (db_idx == 0) { ddp_flags |= V_TF_DDP_BUF0_VALID(1); if (so->so_state & SS_NBIO) ddp_flags |= V_TF_DDP_BUF0_FLUSH(1); ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) | V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) | V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1); buf_flag = DDP_BUF0_ACTIVE; } else { ddp_flags |= V_TF_DDP_BUF1_VALID(1); if (so->so_state & SS_NBIO) ddp_flags |= V_TF_DDP_BUF1_FLUSH(1); ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) | V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1); buf_flag = DDP_BUF1_ACTIVE; } MPASS((toep->ddp.flags & buf_flag) == 0); if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) { MPASS(db_idx == 0); MPASS(toep->ddp.active_id == -1); MPASS(toep->ddp.active_count == 0); ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1); } /* * The TID for this connection should still be valid. If DDP_DEAD * is set, SBS_CANTRCVMORE should be set, so we shouldn't be * this far anyway. Even if the socket is closing on the other * end, the AIO job holds a reference on this end of the socket * which will keep it open and keep the TCP PCB attached until * after the job is completed. */ wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received, ddp_flags, ddp_flags_mask); if (wr == NULL) { recycle_pageset(toep, ps); aio_ddp_requeue_one(toep, job); toep->ddp.queueing = NULL; /* * XXX: Need a way to kick a retry here. * * XXX: We know the fixed size needed and could * preallocate this using a blocking request at the * start of the task to avoid having to handle this * edge case. */ printf("%s: mk_update_tcb_for_ddp failed\n", __func__); return; } if (!aio_set_cancel_function(job, t4_aio_cancel_active)) { free_wrqe(wr); recycle_pageset(toep, ps); aio_ddp_cancel_one(job); toep->ddp.queueing = NULL; goto restart; } #ifdef VERBOSE_TRACES CTR6(KTR_CXGBE, "%s: tid %u, scheduling %p for DDP[%d] (flags %#lx/%#lx)", __func__, toep->tid, job, db_idx, ddp_flags, ddp_flags_mask); #endif /* Give the chip the go-ahead. */ t4_wrq_tx(sc, wr); db = &toep->ddp.db[db_idx]; db->cancel_pending = 0; db->job = job; db->ps = ps; toep->ddp.queueing = NULL; toep->ddp.flags |= buf_flag; toep->ddp.active_count++; if (toep->ddp.active_count == 1) { MPASS(toep->ddp.active_id == -1); toep->ddp.active_id = db_idx; CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__, toep->ddp.active_id); } goto restart; } void ddp_queue_toep(struct toepcb *toep) { DDP_ASSERT_LOCKED(toep); if (toep->ddp.flags & DDP_TASK_ACTIVE) return; toep->ddp.flags |= DDP_TASK_ACTIVE; hold_toepcb(toep); soaio_enqueue(&toep->ddp.requeue_task); } static void aio_ddp_requeue_task(void *context, int pending) { struct toepcb *toep = context; DDP_LOCK(toep); aio_ddp_requeue(toep); toep->ddp.flags &= ~DDP_TASK_ACTIVE; DDP_UNLOCK(toep); free_toepcb(toep); } static void t4_aio_cancel_active(struct kaiocb *job) { struct socket *so = job->fd_file->f_data; struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); uint64_t valid_flag; int i; DDP_LOCK(toep); if (aio_cancel_cleared(job)) { DDP_UNLOCK(toep); aio_ddp_cancel_one(job); return; } for (i = 0; i < nitems(toep->ddp.db); i++) { if (toep->ddp.db[i].job == job) { /* Should only ever get one cancel request for a job. */ MPASS(toep->ddp.db[i].cancel_pending == 0); /* * Invalidate this buffer. It will be * cancelled or partially completed once the * card ACKs the invalidate. */ valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) : V_TF_DDP_BUF1_VALID(1); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1, i + DDP_BUF0_INVALIDATED); toep->ddp.db[i].cancel_pending = 1; CTR2(KTR_CXGBE, "%s: request %p marked pending", __func__, job); break; } } DDP_UNLOCK(toep); } static void t4_aio_cancel_queued(struct kaiocb *job) { struct socket *so = job->fd_file->f_data; struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; DDP_LOCK(toep); if (!aio_cancel_cleared(job)) { TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count--; if (toep->ddp.waiting_count == 0) ddp_queue_toep(toep); } CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job); DDP_UNLOCK(toep); aio_ddp_cancel_one(job); } int t4_aio_queue_ddp(struct socket *so, struct kaiocb *job) { struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; /* Ignore writes. */ if (job->uaiocb.aio_lio_opcode != LIO_READ) return (EOPNOTSUPP); DDP_LOCK(toep); /* * XXX: Think about possibly returning errors for ENOTCONN, * etc. Perhaps the caller would only queue the request * if it failed with EOPNOTSUPP? */ #ifdef VERBOSE_TRACES CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid); #endif if (!aio_set_cancel_function(job, t4_aio_cancel_queued)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count++; toep->ddp.flags |= DDP_OK; /* * Try to handle this request synchronously. If this has * to block because the task is running, it will just bail * and let the task handle it instead. */ aio_ddp_requeue(toep); DDP_UNLOCK(toep); return (0); } void t4_ddp_mod_load(void) { t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl, CPL_COOKIE_DDP0); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl, CPL_COOKIE_DDP1); t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); TAILQ_INIT(&ddp_orphan_pagesets); mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF); TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL); } void t4_ddp_mod_unload(void) { taskqueue_drain(taskqueue_thread, &ddp_orphan_task); MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets)); mtx_destroy(&ddp_orphan_pagesets_lock); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1); t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL); t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL); } #endif Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 367333) +++ head/sys/kern/init_main.c (revision 367334) @@ -1,889 +1,889 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; int linux_alloc_current_noop(struct thread *td __unused, int flags __unused) { return (0); } int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef VERBOSE_SYSINIT /* * We'll use the defined value of VERBOSE_SYSINIT from the kernel config to * dictate the default VERBOSE_SYSINIT behavior. Significant values for this * option and associated tunable are: * - 0, 'compiled in but silent by default' * - 1, 'compiled in but verbose by default' (default) */ int verbose_sysinit = VERBOSE_SYSINIT; TUNABLE_INT("debug.verbose_sysinit", &verbose_sysinit); #endif #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif TSENTER(); if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last && verbose_sysinit != 0) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } TSEXIT(); /* Here so we don't overlap with start_init. */ mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_FOURTH, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_FOURTH, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_FIFTH, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; struct uidinfo tmpuinfo; struct loginclass tmplc = { .lc_name = "", }; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); td->td_domain.dr_policy = td->td_cpuset->cs_domain; prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; p->p_treeflag |= P_TREE_REAPER; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ /* A hack to prevent uifind from tripping over NULL pointers. */ curthread->td_ucred = newcred; tmpuinfo.ui_uid = 1; newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo; newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_loginclass = &tmplc; newcred->cr_loginclass = loginclass_find("default"); /* End hack. creds get properly set later with thread_cow_get_proc */ curthread->td_ucred = NULL; newcred->cr_prison = &prison0; newcred->cr_users++; /* avoid assertion failure */ proc_set_cred_init(p, newcred); newcred->cr_users--; crfree(newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false, NULL); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)vm_free_count()); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; - vmspace0.vm_refcnt = 1; + refcount_init(&vmspace0.vm_refcnt, 1); pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_DIRECT_INVOKE(process_init, p); EVENTHANDLER_DIRECT_INVOKE(thread_init, td); EVENTHANDLER_DIRECT_INVOKE(process_ctor, p); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; PROC_STATUNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { struct image_args args; int error; char *var, *path; char *free_init_path, *tmp_init_path; struct thread *td; struct proc *p; struct vmspace *oldvmspace; TSENTER(); /* Here so we don't overlap with mi_startup. */ td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* For Multicons, report which console is primary to both */ if (boothowto & RB_MULTIPLE) { if (boothowto & RB_SERIAL) printf("Dual Console: Serial Primary, Video Secondary\n"); else printf("Dual Console: Video Primary, Serial Secondary\n"); } if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } free_init_path = tmp_init_path = strdup(init_path, M_TEMP); while ((path = strsep(&tmp_init_path, ":")) != NULL) { if (bootverbose) printf("start_init: trying %s\n", path); memset(&args, 0, sizeof(args)); error = exec_alloc_args(&args); if (error != 0) panic("%s: Can't allocate space for init arguments %d", __func__, error); error = exec_args_add_fname(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add fname %d", __func__, error); error = exec_args_add_arg(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); if (boothowto & RB_SINGLE) error = exec_args_add_arg(&args, "-s", UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ KASSERT((td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); oldvmspace = td->td_proc->p_vmspace; error = kern_execve(td, &args, NULL, oldvmspace); KASSERT(error != 0, ("kern_execve returned success, not EJUSTRETURN")); if (error == EJUSTRETURN) { exec_cleanup(td, oldvmspace); free(free_init_path, M_TEMP); TSEXIT(); return; } if (error != ENOENT) printf("exec %s: error %d\n", path, error); } free(free_init_path, M_TEMP); printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. We do this * early to reserve pid 1. Note special case - do not make it * runnable yet, init execution is started when userspace can be served. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crcowfree(td); td->td_realucred = crcowget(initproc->p_ucred); td->td_ucred = td->td_realucred; PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); /* * DDB(4). */ #ifdef DDB static void db_show_print_syinit(struct sysinit *sip, bool ddb) { const char *sname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...) \ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (sip == NULL) { xprint("%s: no sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)sip, DB_STGY_ANY, &offset); db_symbol_values(sym, &sname, NULL); sym = db_search_symbol((vm_offset_t)sip->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (sname != NULL) ? sname : "", sip); xprint(" %#08x %#08x\n", sip->subsystem, sip->order); xprint(" %p(%s)(%p)\n", sip->func, (funcname != NULL) ? funcname : "", sip->udata); #undef xprint } DB_SHOW_COMMAND(sysinit, db_show_sysinit) { struct sysinit **sipp; db_printf("SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); for (sipp = sysinit; sipp < sysinit_end; sipp++) { db_show_print_syinit(*sipp, true); if (db_pager_quit) break; } } #endif /* DDB */ Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 367333) +++ head/sys/kern/kern_exec.c (revision 367334) @@ -1,1841 +1,1842 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", "Location of process' ps_strings structure"); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", "Top of process stack"); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", "Stack memory permissions"); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, "Process' command line characters cache limit"); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; }; #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL, oldvmspace); } post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } exec_cleanup(td, oldvmspace); } /* * kern_execve() has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, exec_args_get_begin_envv(args) - args->begin_argv); AUDIT_ARG_ENVV(exec_args_get_begin_envv(args), args->envc, args->endp - exec_args_get_begin_envv(args)); return (do_execve(td, args, mac_p, oldvmspace)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; uintptr_t stack_base; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; int credential_changing; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif int error, i, orig_osrel; uint32_t orig_fctl0; static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; orig_osrel = p->p_osrel; orig_fctl0 = p->p_fctl0; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions. Also 'opens' file and sets its vnode to * text mode. */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; imgp->proc->p_fctl0 = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ if (credential_changing) imgp->proc->p_pdeathsig = 0; if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp); if (vn_fullpath(imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) error = ENOEXEC; goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * The text reference needs to be removed for scripts. * There is a short period before we determine that * something is a script where text reference is active. * The vnode lock is held over this entire period * so nothing should illegitimately be blocked. */ MPASS(imgp->textset); VOP_UNSET_TEXT_CHECKED(newtextvp); imgp->textset = false; /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base. */ error = (*p->p_sysent->sv_copyout_strings)(imgp, &stack_base); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Stack setup. */ error = (*p->p_sysent->sv_fixup)(&stack_base, imgp); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0) p->p_flag2 &= ~P2_STKGAP_DISABLE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ (*p->p_sysent->sv_setregs)(td, imgp, stack_base); VOP_MMAPPED(imgp->vp); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (error != 0) { p->p_osrel = orig_osrel; p->p_fctl0 = orig_fctl0; } if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (imgp->textset) VOP_UNSET_TEXT_CHECKED(imgp->vp); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exec_cleanup(td, oldvmspace); exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? EJUSTRETURN : error); } void exec_cleanup(struct thread *td, struct vmspace *oldvmspace) { if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(td->td_proc->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } int exec_map_first_page(struct image_params *imgp) { vm_object_t object; vm_page_t m; int error; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(object); vm_object_color(object, 0); VM_OBJECT_WUNLOCK(object); } #endif error = vm_page_grab_valid_unlocked(&m, object, 0, VM_ALLOC_COUNT(VM_INITIAL_PAGEIN) | VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (error != VM_PAGER_OK) return (EIO); imgp->firstpage = sf_buf_alloc(m, 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_unwire(m, PQ_ACTIVE); } } /* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; struct thread *td = curthread; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; sigfastblock_clear(td); /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); - if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && + if (refcount_load(&vmspace->vm_refcnt) == 1 && + vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser && cpu_exec_vmspace_reuse(p, map)) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* * An exec terminates mlockall(MCL_FUTURE), ASLR state * must be re-evaluated. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR | MAP_ASLR_IGNSTART); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } map->flags |= imgp->map_flags; /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } imgp->eff_stack_sz = lim_cur(curthread, RLIMIT_STACK); if (ssiz < imgp->eff_stack_sz) imgp->eff_stack_sz = ssiz; stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, const char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long arg, env; int error; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ error = exec_args_add_fname(args, fname, segflg); if (error != 0) goto err_exit; /* * extract arguments first */ for (;;) { error = fueword(argv++, &arg); if (error == -1) { error = EFAULT; goto err_exit; } if (arg == 0) break; error = exec_args_add_arg(args, (char *)(uintptr_t)arg, UIO_USERSPACE); if (error != 0) goto err_exit; } /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &env); if (error == -1) { error = EFAULT; goto err_exit; } if (env == 0) break; error = exec_args_add_env(args, (char *)(uintptr_t)env, UIO_USERSPACE); if (error != 0) goto err_exit; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen >= ofdp->fd_nfiles) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* Create new file descriptor table. */ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; DPCPU_DEFINE_STATIC(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; if (argkva->gen != gen) { (void)vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * A set to functions to fill struct image args. * * NOTE: exec_args_add_fname() must be called (possibly with a NULL * fname) before the other functions. All exec_args_add_arg() calls must * be made before any exec_args_add_env() calls. exec_args_adjust_args() * may be called any time after exec_args_add_fname(). * * exec_args_add_fname() - install path to be executed * exec_args_add_arg() - append an argument string * exec_args_add_env() - append an env string * exec_args_adjust_args() - adjust location of the argument list to * allow new arguments to be prepended */ int exec_args_add_fname(struct image_args *args, const char *fname, enum uio_seg segflg) { int error; size_t length; KASSERT(args->fname == NULL, ("fname already appended")); KASSERT(args->endp == NULL, ("already appending to args")); if (fname != NULL) { args->fname = args->buf; error = segflg == UIO_SYSSPACE ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) return (error == ENAMETOOLONG ? E2BIG : error); } else length = 0; /* Set up for _arg_*()/_env_*() */ args->endp = args->buf + length; /* begin_argv must be set and kept updated */ args->begin_argv = args->endp; KASSERT(exec_map_entry_size - length >= ARG_MAX, ("too little space remaining for arguments %zu < %zu", exec_map_entry_size - length, (size_t)ARG_MAX)); args->stringspace = ARG_MAX; return (0); } static int exec_args_add_str(struct image_args *args, const char *str, enum uio_seg segflg, int *countp) { int error; size_t length; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); error = (segflg == UIO_SYSSPACE) ? copystr(str, args->endp, args->stringspace, &length) : copyinstr(str, args->endp, args->stringspace, &length); if (error != 0) return (error == ENAMETOOLONG ? E2BIG : error); args->stringspace -= length; args->endp += length; (*countp)++; return (0); } int exec_args_add_arg(struct image_args *args, const char *argp, enum uio_seg segflg) { KASSERT(args->envc == 0, ("appending args after env")); return (exec_args_add_str(args, argp, segflg, &args->argc)); } int exec_args_add_env(struct image_args *args, const char *envp, enum uio_seg segflg) { if (args->envc == 0) args->begin_envv = args->endp; return (exec_args_add_str(args, envp, segflg, &args->envc)); } int exec_args_adjust_args(struct image_args *args, size_t consume, ssize_t extend) { ssize_t offset; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); offset = extend - consume; if (args->stringspace < offset) return (E2BIG); memmove(args->begin_argv + extend, args->begin_argv + consume, args->endp - args->begin_argv + consume); if (args->envc > 0) args->begin_envv += offset; args->endp += offset; args->stringspace -= offset; return (0); } char * exec_args_get_begin_envv(struct image_args *args) { KASSERT(args->endp != NULL, ("endp not initialized")); if (args->envc > 0) return (args->begin_envv); return (args->endp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ int exec_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int error, szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; imgp->ps_strings = arginfo; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); error = copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); if (error != 0) return (error); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = (void *)destp; error = copyout(pagesizes, imgp->pagesizes, szps); if (error != 0) return (error); imgp->pagesizeslen = szps; /* * Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->sysent->sv_stackgap != NULL) imgp->sysent->sv_stackgap(imgp, &destp); if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to AT_COUNT entries. */ destp -= AT_COUNT * sizeof(Elf_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * vectp also becomes our initial stack base */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* * Fill in "ps_strings" struct for ps, w, etc. */ imgp->argv = vectp; if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* a null vector table pointer separates the argp's from the envp's */ if (suword(vectp++, 0) != 0) return (EFAULT); imgp->envv = vectp; if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* end of vector table is a null pointer */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. * * Add a text reference now so no one can write to the * executable while we're activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ error = VOP_SET_TEXT(vp); if (error != 0) return (error); imgp->textset = true; /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; u_int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 367333) +++ head/sys/kern/vfs_aio.c (revision 367334) @@ -1,2986 +1,2987 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); static int num_unmapped_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, 0, "Number of aio requests presently handled by unmapped I/O buffers"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. */ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of locks used to protect each member of struct kaiocb * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qbio() complete asynchronously and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. * * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. */ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) listio flags */ int lioj_finished_count; /* (a) listio flags */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_count; /* (a) size of AIO queue */ int kaio_buffer_count; /* (a) number of bio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations. */ struct aiocb_ops { int (*aio_copyin)(struct aiocb *ujob, struct aiocb *kjob); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qbio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io process data * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; static struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. */ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_active_count = 0; ki->kaio_count = 0; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. */ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. */ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < max_aio_per_proc) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp); vn_finished_write(mp); drop: return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-bio version of the operations. The normal * vn operations are used, and this code should work in all instances for every * type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. */ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; struct uio auio; struct iovec aiov; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_WRITE, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; cb = &job->uaiocb; fp = job->fd_file; aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; if (auio.uio_resid == 0) error = 0; else error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= auio.uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? -1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. */ return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. */ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); - KASSERT(myvm->vm_refcnt > 1, - ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); + KASSERT(refcount_load(&myvm->vm_refcnt) > 1, + ("AIOD: bad vm refcnt for exiting daemon: %d", + refcount_load(&myvm->vm_refcnt))); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead bio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qbio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct bio *bp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; int error, ref, poff; vm_prot_t prot; cb = &job->uaiocb; fp = job->fd_file; if (!(cb->aio_lio_opcode == LIO_WRITE || cb->aio_lio_opcode == LIO_READ)) return (-1); if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (cb->aio_nbytes > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { if (cb->aio_nbytes > MAXPHYS) { error = -1; goto unref; } pbuf = NULL; } else { if (cb->aio_nbytes > MAXPHYS - poff) { error = -1; goto unref; } if (ki->kaio_buffer_count >= max_buf_aio) { error = EAGAIN; goto unref; } job->pbuf = pbuf = uma_zalloc(pbuf_zone, M_WAITOK); BUF_KERNPROC(pbuf); AIO_LOCK(ki); ki->kaio_buffer_count++; AIO_UNLOCK(ki); } job->bp = bp = g_alloc_bio(); bp->bio_length = cb->aio_nbytes; bp->bio_bcount = cb->aio_nbytes; bp->bio_done = aio_biowakeup; bp->bio_offset = cb->aio_offset; bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; bp->bio_dev = dev; bp->bio_caller1 = (void *)job; prot = VM_PROT_READ; if (cb->aio_lio_opcode == LIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)cb->aio_buf, bp->bio_length, prot, job->pages, nitems(job->pages)); if (job->npages < 0) { error = EFAULT; goto doerror; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, job->pages, job->npages); bp->bio_data = pbuf->b_data + poff; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = job->pages; bp->bio_ma_n = job->npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; atomic_add_int(&num_unmapped_aio, 1); } /* Perform transfer. */ csw->d_strategy(bp); dev_relthread(dev, ref); return (0); doerror: if (pbuf != NULL) { AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); uma_zfree(pbuf_zone, pbuf); job->pbuf = NULL; } g_destroy_bio(bp); job->bp = NULL; unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb *ojob; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, kjob, sizeof(struct oaiocb)); if (error) return (error); ojob = (struct oaiocb *)kjob; return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) { return (copyin(ujob, kjob, sizeof(struct aiocb))); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .aio_copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .aio_copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct bio VCHR * technique is done in this code. */ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct file *fp; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= max_aio_queue_per_proc) { ops->store_error(ujob, EAGAIN); return (EAGAIN); } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->aio_copyin(ujob, &job->uaiocb); if (error) { ops->store_error(ujob, error); uma_zfree(aiocb_zone, job); return (error); } if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { uma_zfree(aiocb_zone, job); return (EINVAL); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { ops->store_error(ujob, EINVAL); uma_zfree(aiocb_zone, job); return (EINVAL); } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, job); return (EINVAL); } ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* Get the opcode. */ if (type != LIO_NOP) job->uaiocb.aio_lio_opcode = type; opcode = job->uaiocb.aio_lio_opcode; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capabiltity * should be. */ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: error = fget_write(td, fd, &cap_pwrite_rights, &fp); break; case LIO_READ: error = fget_read(td, fd, &cap_pread_rights, &fp); break; case LIO_SYNC: error = fget(td, fd, &cap_fsync_rights, &fp); break; case LIO_MLOCK: fp = NULL; break; case LIO_NOP: error = fget(td, fd, &cap_no_rights, &fp); break; default: error = EINVAL; } if (error) { uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if ((opcode == LIO_READ || opcode == LIO_WRITE) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto aqueue_fail; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto aqueue_fail; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto aqueue_fail; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; memset(&kev, 0, sizeof(kev)); kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, M_WAITOK); if (error) goto aqueue_fail; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto aqueue_fail; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. */ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); aqueue_fail: knlist_delete(&job->klist, curthread, 0); if (fp) fdrop(fp, td); uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error; bool safe; ki = job->userproc->p_aioinfo; error = aio_qbio(job->userproc, job); if (error >= 0) return (error); safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } switch (job->uaiocb.aio_lio_opcode) { case LIO_READ: case LIO_WRITE: aio_schedule(job, aio_process_rw); error = 0; break; case LIO_SYNC: AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && job2->uaiocb.aio_lio_opcode != LIO_SYNC && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); error = 0; break; default: error = EINVAL; } return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIOS); return (error); } /* * aio_cancel cancels any non-bio aio operations not currently in progress. */ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ error = fget(td, uap->fd, &cap_no_rights, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nagain, nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ memset(&kev, 0, sizeof(kev)); kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, M_WAITOK); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nagain = 0; nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error == EAGAIN) nagain++; else if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); else if (nagain) return (EAGAIN); else return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_biowakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct proc *userp; struct kaioinfo *ki; size_t nbytes; int error, nblks; /* Release mapping into kernel space. */ userp = job->userproc; ki = userp->p_aioinfo; if (job->pbuf) { pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); uma_zfree(pbuf_zone, job->pbuf); job->pbuf = NULL; atomic_subtract_int(&num_buf_aio, 1); AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } else atomic_subtract_int(&num_unmapped_aio, 1); vm_page_unhold_pages(job->pages, job->npages); bp = job->bp; job->bp = NULL; nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; error = 0; if (bp->bio_flags & BIO_ERROR) error = bp->bio_error; nblks = btodb(nbytes); if (job->uaiocb.aio_lio_opcode == LIO_WRITE) job->outblock += nblks; else job->inblock += nblks; if (error) aio_complete(job, -1, error); else aio_complete(job, nbytes, 0); g_destroy_bio(bp); } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { if (op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb32 job32; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) { struct aiocb32 job32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .aio_copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .aio_copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIOS); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 367333) +++ head/sys/vm/vm_glue.c (revision 367334) @@ -1,603 +1,604 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(void *addr, int len, int rw) { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); if ((vm_offset_t)addr + len > vm_map_max(kernel_map) || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjunction with this call. */ int useracc(void *addr, int len, int rw) { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_user_wired) return (ENOMEM); error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (error == KERN_SUCCESS) { curthread->td_vslock_sz += len; return (0); } /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ MPASS(curthread->td_vslock_sz >= len); curthread->td_vslock_sz -= len; (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Pin the page contained within the given object at the given offset. If the * page is not resident, allocate and load it using the given object's pager. * Return the pinned page if successful; otherwise, return NULL. */ static vm_page_t vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; vm_pindex_t pindex; pindex = OFF_TO_IDX(offset); (void)vm_page_grab_valid_unlocked(&m, object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); return (m); } /* * Return a CPU private mapping to the page at the given offset within the * given object. The page is pinned before it is mapped. */ struct sf_buf * vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; m = vm_imgact_hold_page(object, offset); if (m == NULL) return (NULL); sched_pin(); return (sf_buf_alloc(m, SFB_CPUPRIVATE)); } /* * Destroy the given CPU private mapping and unpin the page that it mapped. */ void vm_imgact_unmap_page(struct sf_buf *sf) { vm_page_t m; m = sf_buf_page(sf); sf_buf_free(sf); sched_unpin(); vm_page_unwire(m, PQ_ACTIVE); } void vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz) { pmap_sync_icache(map->pmap, va, sz); } vm_object_t kstack_object; static uma_zone_t kstack_cache; static int kstack_cache_size; static int sysctl_kstack_cache_size(SYSCTL_HANDLER_ARGS) { int error, oldsize; oldsize = kstack_cache_size; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0 && req->newptr && oldsize != kstack_cache_size) uma_zone_set_maxcache(kstack_cache, kstack_cache_size); return (error); } SYSCTL_PROC(_vm, OID_AUTO, kstack_cache_size, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &kstack_cache_size, 0, sysctl_kstack_cache_size, "IU", "Maximum number of cached kernel stacks"); /* * Create the kernel stack (including pcb for i386) for a new thread. */ static vm_offset_t vm_thread_stack_create(struct domainset *ds, int pages) { vm_page_t ma[KSTACK_MAX_PAGES]; vm_offset_t ks; int i; /* * Get a kernel virtual address for this thread's kstack. */ #if defined(__mips__) /* * We need to align the kstack's mapped address to fit within * a single TLB entry. */ if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_NOWAIT, &ks)) { ks = 0; } #else ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); #endif if (ks == 0) { printf("%s: kstack allocation failed\n", __func__); return (0); } if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } /* * Allocate physical pages to back the stack. */ vm_thread_stack_back(ds, ks, ma, pages, VM_ALLOC_NORMAL); for (i = 0; i < pages; i++) vm_page_valid(ma[i]); pmap_qenter(ks, ma, pages); return (ks); } static void vm_thread_stack_dispose(vm_offset_t ks, int pages) { vm_page_t m; vm_pindex_t pindex; int i; pindex = atop(ks - VM_MIN_KERNEL_ADDRESS); pmap_qremove(ks, pages); VM_OBJECT_WLOCK(kstack_object); for (i = 0; i < pages; i++) { m = vm_page_lookup(kstack_object, pindex + i); if (m == NULL) panic("%s: kstack already missing?", __func__); vm_page_xbusy_claim(m); vm_page_unwire_noq(m); vm_page_free(m); } VM_OBJECT_WUNLOCK(kstack_object); kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Allocate the kernel stack for a new thread. */ int vm_thread_new(struct thread *td, int pages) { vm_offset_t ks; /* Bounds check */ if (pages <= 1) pages = kstack_pages; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; ks = 0; if (pages == kstack_pages && kstack_cache != NULL) ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT); /* * Ensure that kstack objects can draw pages from any memory * domain. Otherwise a local memory shortage can block a process * swap-in. */ if (ks == 0) ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)), pages); if (ks == 0) return (0); td->td_kstack = ks; td->td_kstack_pages = pages; return (1); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_offset_t ks; int pages; pages = td->td_kstack_pages; ks = td->td_kstack; td->td_kstack = 0; td->td_kstack_pages = 0; if (pages == kstack_pages) uma_zfree(kstack_cache, (void *)ks); else vm_thread_stack_dispose(ks, pages); } /* * Allocate physical pages, following the specified NUMA policy, to back a * kernel stack. */ void vm_thread_stack_back(struct domainset *ds, vm_offset_t ks, vm_page_t ma[], int npages, int req_class) { vm_pindex_t pindex; int n; pindex = atop(ks - VM_MIN_KERNEL_ADDRESS); VM_OBJECT_WLOCK(kstack_object); for (n = 0; n < npages;) { if (vm_ndomains > 1) kstack_object->domain.dr_policy = ds; /* * Use WAITFAIL to force a reset of the domain selection policy * if we had to sleep for pages. */ n += vm_page_grab_pages(kstack_object, pindex + n, req_class | VM_ALLOC_WIRED | VM_ALLOC_WAITFAIL, &ma[n], npages - n); } VM_OBJECT_WUNLOCK(kstack_object); } static int kstack_import(void *arg, void **store, int cnt, int domain, int flags) { struct domainset *ds; int i; if (domain == UMA_ANYDOMAIN) ds = DOMAINSET_RR(); else ds = DOMAINSET_PREF(domain); for (i = 0; i < cnt; i++) { store[i] = (void *)vm_thread_stack_create(ds, kstack_pages); if (store[i] == NULL) break; } return (i); } static void kstack_release(void *arg, void **store, int cnt) { vm_offset_t ks; int i; for (i = 0; i < cnt; i++) { ks = (vm_offset_t)store[i]; vm_thread_stack_dispose(ks, kstack_pages); } } static void kstack_cache_init(void *null) { kstack_object = vm_object_allocate(OBJT_SWAP, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS)); kstack_cache = uma_zcache_create("kstack_cache", kstack_pages * PAGE_SIZE, NULL, NULL, NULL, NULL, kstack_import, kstack_release, NULL, UMA_ZONE_FIRSTTOUCH); kstack_cache_size = imax(128, mp_ncpus * 4); uma_zone_set_maxcache(kstack_cache, kstack_cache_size); } SYSINIT(vm_kstacks, SI_SUB_KMEM, SI_ORDER_ANY, kstack_cache_init, NULL); #ifdef KSTACK_USAGE_PROF /* * Track maximum stack used by a thread in kernel. */ static int max_kstack_used; SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD, &max_kstack_used, 0, "Maxiumum stack depth used by a thread in kernel"); void intr_prof_stack_use(struct thread *td, struct trapframe *frame) { vm_offset_t stack_top; vm_offset_t current; int used, prev_used; /* * Testing for interrupted kernel mode isn't strictly * needed. It optimizes the execution, since interrupts from * usermode will have only the trap frame on the stack. */ if (TRAPF_USERMODE(frame)) return; stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE; current = (vm_offset_t)(uintptr_t)&stack_top; /* * Try to detect if interrupt is using kernel thread stack. * Hardware could use a dedicated stack for interrupt handling. */ if (stack_top <= current || current < td->td_kstack) return; used = stack_top - current; for (;;) { prev_used = max_kstack_used; if (prev_used >= used) break; if (atomic_cmpset_int(&max_kstack_used, prev_used, used)) break; } } #endif /* KSTACK_USAGE_PROF */ /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ int vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2, struct vmspace *vm2, int flags) { struct proc *p1 = td->td_proc; struct domainset *dset; int error; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. */ if ((flags & RFMEM) == 0) { - if (p1->p_vmspace->vm_refcnt > 1) { + if (refcount_load(&p1->p_vmspace->vm_refcnt) > 1) { error = vmspace_unshare(p1); if (error) return (error); } } cpu_fork(td, p2, td2, flags); return (0); } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; - atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); + refcount_acquire(&p1->p_vmspace->vm_refcnt); } dset = td2->td_domain.dr_policy; while (vm_page_count_severe_set(&dset->ds_mask)) { vm_wait_doms(&dset->ds_mask, 0); } if ((flags & RFMEM) == 0) { p2->p_vmspace = vm2; if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); return (0); } /* * Called after process has been wait(2)'ed upon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { vmspace_exitfree(p); /* and clean-out the vmspace */ } void kick_proc0(void) { wakeup(&proc0); } Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 367333) +++ head/sys/vm/vm_map.c (revision 367334) @@ -1,5277 +1,5263 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a self-adjusting binary search tree of these * entries is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t vmspace_zone; static int vmspace_zinit(void *mem, int size, int flags); static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max); static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry); static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg); #endif static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow); static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr); #define ENTRY_CHARGED(e) ((e)->cred != NULL || \ ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \ !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } static int vmspace_zinit(void *mem, int size, int flags) { struct vmspace *vm; vm_map_t map; vm = (struct vmspace *)mem; map = &vm->vm_map; memset(map, 0, sizeof(*map)); mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "vm map (user)"); PMAP_LOCK_INIT(vmspace_pmap(vm)); return (0); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; KASSERT(vm->vm_map.nentries == 0, ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries)); KASSERT(vm->vm_map.size == 0, ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size)); } #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. */ struct vmspace * vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit) { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL")); if (!pinit(vmspace_pmap(vm))) { uma_zfree(vmspace_zone, vm); return (NULL); } CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max); - vm->vm_refcnt = 1; + refcount_init(&vm->vm_refcnt, 1); vm->vm_shm = NULL; vm->vm_swrss = 0; vm->vm_tsize = 0; vm->vm_dsize = 0; vm->vm_ssize = 0; vm->vm_taddr = 0; vm->vm_daddr = 0; vm->vm_maxsaddr = 0; return (vm); } #ifdef RACCT static void vmspace_container_reset(struct proc *p) { PROC_LOCK(p); racct_set(p, RACCT_DATA, 0); racct_set(p, RACCT_STACK, 0); racct_set(p, RACCT_RSS, 0); racct_set(p, RACCT_MEMLOCK, 0); racct_set(p, RACCT_VMEM, 0); PROC_UNLOCK(p); } #endif static inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). */ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); pmap_release(vmspace_pmap(vm)); vm->vm_map.pmap = NULL; uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmspace_free() called"); - if (vm->vm_refcnt == 0) - panic("vmspace_free: attempt to free already freed vmspace"); - - if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1) + if (refcount_release(&vm->vm_refcnt)) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; p->p_vmspace = NULL; PROC_VMSPACE_UNLOCK(p); KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } void vmspace_exit(struct thread *td) { - int refcnt; struct vmspace *vm; struct proc *p; + bool released; - /* - * Release user portion of address space. - * This releases references to vnodes, - * which could cause I/O if the file has been unlinked. - * Need to do this early enough that we can still sleep. - * - * The last exiting process to reach this point releases as - * much of the environment as it can. vmspace_dofree() is the - * slower fallback in case another process had a temporary - * reference to the vmspace. - */ - p = td->td_proc; vm = p->p_vmspace; - atomic_add_int(&vmspace0.vm_refcnt, 1); - refcnt = vm->vm_refcnt; - do { - if (refcnt > 1 && p->p_vmspace != &vmspace0) { - /* Switch now since other proc might free vmspace */ + + /* + * Prepare to release the vmspace reference. The thread that releases + * the last reference is responsible for tearing down the vmspace. + * However, threads not releasing the final reference must switch to the + * kernel's vmspace0 before the decrement so that the subsequent pmap + * deactivation does not modify a freed vmspace. + */ + refcount_acquire(&vmspace0.vm_refcnt); + if (!(released = refcount_release_if_last(&vm->vm_refcnt))) { + if (p->p_vmspace != &vmspace0) { PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } - } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1)); - if (refcnt == 1) { + released = refcount_release(&vm->vm_refcnt); + } + if (released) { + /* + * pmap_remove_pages() expects the pmap to be active, so switch + * back first if necessary. + */ if (p->p_vmspace != vm) { - /* vmspace not yet freed, switch back */ PROC_VMSPACE_LOCK(p); p->p_vmspace = vm; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } pmap_remove_pages(vmspace_pmap(vm)); - /* Switch now since this proc will free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); vmspace_dofree(vm); } #ifdef RACCT if (racct_enable) vmspace_container_reset(p); #endif } /* Acquire reference to vmspace owned by another process. */ struct vmspace * vmspace_acquire_ref(struct proc *p) { struct vmspace *vm; - int refcnt; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; - if (vm == NULL) { + if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) { PROC_VMSPACE_UNLOCK(p); return (NULL); } - refcnt = vm->vm_refcnt; - do { - if (refcnt <= 0) { /* Avoid 0->1 transition */ - PROC_VMSPACE_UNLOCK(p); - return (NULL); - } - } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1)); if (vm != p->p_vmspace) { PROC_VMSPACE_UNLOCK(p); vmspace_free(vm); return (NULL); } PROC_VMSPACE_UNLOCK(p); return (vm); } /* * Switch between vmspaces in an AIO kernel process. * * The new vmspace is either the vmspace of a user process obtained * from an active AIO request or the initial vmspace of the AIO kernel * process (when it is idling). Because user processes will block to * drain any active AIO requests before proceeding in exit() or * execve(), the reference count for vmspaces from AIO requests can * never be 0. Similarly, AIO kernel processes hold an extra * reference on their initial vmspace for the life of the process. As * a result, the 'newvm' vmspace always has a non-zero reference * count. This permits an additional reference on 'newvm' to be * acquired via a simple atomic increment rather than the loop in * vmspace_acquire_ref() above. */ void vmspace_switch_aio(struct vmspace *newvm) { struct vmspace *oldvm; /* XXX: Need some way to assert that this is an aio daemon. */ - KASSERT(newvm->vm_refcnt > 0, + KASSERT(refcount_load(&newvm->vm_refcnt) > 0, ("vmspace_switch_aio: newvm unreferenced")); oldvm = curproc->p_vmspace; if (oldvm == newvm) return; /* * Point to the new address space and refer to it. */ curproc->p_vmspace = newvm; - atomic_add_int(&newvm->vm_refcnt, 1); + refcount_acquire(&newvm->vm_refcnt); /* Activate the new mapping. */ pmap_activate(curthread); vmspace_free(oldvm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_xlock_(&map->lock, file, line); map->timestamp++; } void vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add) { vm_object_t object; struct vnode *vp; bool vp_held; if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0) return; KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with execs")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for text, entry %p", entry)); if ((object->flags & OBJ_ANON) != 0) object = object->handle; else KASSERT(object->backing_object == NULL, ("non-anon object %p shadows", object)); KASSERT(object != NULL, ("No content object for text, entry %p obj %p", entry, entry->object.vm_object)); /* * Mostly, we do not lock the backing object. It is * referenced by the entry we are processing, so it cannot go * away. */ vp = NULL; vp_held = false; if (object->type == OBJT_DEAD) { /* * For OBJT_DEAD objects, v_writecount was handled in * vnode_pager_dealloc(). */ } else if (object->type == OBJT_VNODE) { vp = object->handle; } else if (object->type == OBJT_SWAP) { KASSERT((object->flags & OBJ_TMPFS_NODE) != 0, ("vm_map_entry_set_vnode_text: swap and !TMPFS " "entry %p, object %p, add %d", entry, object, add)); /* * Tmpfs VREG node, which was reclaimed, has * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS. In * this case there is no v_writecount to adjust. */ VM_OBJECT_RLOCK(object); if ((object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; if (vp != NULL) { vhold(vp); vp_held = true; } } VM_OBJECT_RUNLOCK(object); } else { KASSERT(0, ("vm_map_entry_set_vnode_text: wrong object type, " "entry %p, object %p, add %d", entry, object, add)); } if (vp != NULL) { if (add) { VOP_SET_TEXT_CHECKED(vp); } else { vn_lock(vp, LK_SHARED | LK_RETRY); VOP_UNSET_TEXT_CHECKED(vp); VOP_UNLOCK(vp); } if (vp_held) vdrop(vp); } } /* * Use a different name for this vm_map_entry field when it's use * is not consistent with its use as part of an ordered search tree. */ #define defer_next right static void vm_map_process_deferred(void) { struct thread *td; vm_map_entry_t entry, next; vm_object_t object; td = curthread; entry = td->td_map_def_user; td->td_map_def_user = NULL; while (entry != NULL) { next = entry->defer_next; MPASS((entry->eflags & (MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC)); if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) { /* * Decrement the object's writemappings and * possibly the vnode's v_writecount. */ KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with writecount")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for writecount")); vm_pager_release_writecount(object, entry->start, entry->end); } vm_map_entry_set_vnode_text(entry, false); vm_map_entry_deallocate(entry, FALSE); entry = next; } } #ifdef INVARIANTS static void _vm_map_assert_locked(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_assert_(&map->system_mtx, MA_OWNED, file, line); else sx_assert_(&map->lock, SA_XLOCKED, file, line); } #define VM_MAP_ASSERT_LOCKED(map) \ _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE) enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL }; #ifdef DIAGNOSTIC static int enable_vmmap_check = VMMAP_CHECK_UNLOCK; #else static int enable_vmmap_check = VMMAP_CHECK_NONE; #endif SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN, &enable_vmmap_check, 0, "Enable vm map consistency checking"); static void _vm_map_assert_consistent(vm_map_t map, int check); #define VM_MAP_ASSERT_CONSISTENT(map) \ _vm_map_assert_consistent(map, VMMAP_CHECK_ALL) #ifdef DIAGNOSTIC #define VM_MAP_UNLOCK_CONSISTENT(map) do { \ if (map->nupdates > map->nentries) { \ _vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \ map->nupdates = 0; \ } \ } while (0) #else #define VM_MAP_UNLOCK_CONSISTENT(map) #endif #else #define VM_MAP_ASSERT_LOCKED(map) #define VM_MAP_ASSERT_CONSISTENT(map) #define VM_MAP_UNLOCK_CONSISTENT(map) #endif /* INVARIANTS */ void _vm_map_unlock(vm_map_t map, const char *file, int line) { VM_MAP_UNLOCK_CONSISTENT(map); if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_xunlock_(&map->lock, file, line); vm_map_process_deferred(); } } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_slock_(&map->lock, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); } } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_xlock_(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_slock_(&map->lock, file, line); return (error == 0); } /* * _vm_map_lock_upgrade: [ internal use only ] * * Tries to upgrade a read (shared) lock on the specified map to a write * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a * non-zero value if the upgrade fails. If the upgrade fails, the map is * returned without a read or write lock held. * * Requires that the map be read locked. */ int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { unsigned int last_timestamp; if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else { if (!sx_try_upgrade_(&map->lock, file, line)) { last_timestamp = map->timestamp; sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); /* * If the map's timestamp does not change while the * map is unlocked, then the upgrade succeeds. */ sx_xlock_(&map->lock, file, line); if (last_timestamp != map->timestamp) { sx_xunlock_(&map->lock, file, line); return (1); } } } map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else { VM_MAP_UNLOCK_CONSISTENT(map); sx_downgrade_(&map->lock, file, line); } } /* * vm_map_locked: * * Returns a non-zero value if the caller holds a write (exclusive) lock * on the specified map and the value "0" otherwise. */ int vm_map_locked(vm_map_t map) { if (map->system_map) return (mtx_owned(&map->system_mtx)); else return (sx_xlocked(&map->lock)); } /* * _vm_map_unlock_and_wait: * * Atomically releases the lock on the specified map and puts the calling * thread to sleep. The calling thread will remain asleep until either * vm_map_wakeup() is performed on the map or the specified timeout is * exceeded. * * WARNING! This function does not perform deferred deallocations of * objects and map entries. Therefore, the calling thread is expected to * reacquire the map lock after reawakening and later perform an ordinary * unlock operation, such as vm_map_unlock(), before completing its * operation on the map. */ int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line) { VM_MAP_UNLOCK_CONSISTENT(map); mtx_lock(&map_sleep_mtx); if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else sx_xunlock_(&map->lock, file, line); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", timo)); } /* * vm_map_wakeup: * * Awaken any threads that have slept on the map using * vm_map_unlock_and_wait(). */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the map unlock * and the msleep() in _vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } void vm_map_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); map->busy++; } void vm_map_unbusy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); KASSERT(map->busy, ("vm_map_unbusy: not busy")); if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) { vm_map_modflags(map, 0, MAP_BUSY_WAKEUP); wakeup(&map->busy); } } void vm_map_wait_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); while (map->busy) { vm_map_modflags(map, MAP_BUSY_WAKEUP, 0); if (map->system_map) msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0); else sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0); } map->timestamp++; } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. */ static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { map->header.eflags = MAP_ENTRY_HEADER; map->needs_wakeup = FALSE; map->system_map = 0; map->pmap = pmap; map->header.end = min; map->header.start = max; map->flags = 0; map->header.left = map->header.right = &map->header; map->root = NULL; map->timestamp = 0; map->busy = 0; map->anon_loc = 0; #ifdef DIAGNOSTIC map->nupdates = 0; #endif } void vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, pmap, min, max); mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "vm map (user)"); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. */ static inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_max_free_{left,right}: * * Compute the size of the largest free gap between two entries, * one the root of a tree and the other the ancestor of that root * that is the least or greatest ancestor found on the search path. */ static inline vm_size_t vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor) { return (root->left != left_ancestor ? root->left->max_free : root->start - left_ancestor->end); } static inline vm_size_t vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor) { return (root->right != right_ancestor ? root->right->max_free : right_ancestor->start - root->end); } /* * vm_map_entry_{pred,succ}: * * Find the {predecessor, successor} of the entry by taking one step * in the appropriate direction and backtracking as much as necessary. * vm_map_entry_succ is defined in vm_map.h. */ static inline vm_map_entry_t vm_map_entry_pred(vm_map_entry_t entry) { vm_map_entry_t prior; prior = entry->left; if (prior->right->start < entry->start) { do prior = prior->right; while (prior->right != entry); } return (prior); } static inline vm_size_t vm_size_max(vm_size_t a, vm_size_t b) { return (a > b ? a : b); } #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \ vm_map_entry_t z; \ vm_size_t max_free; \ \ /* \ * Infer root->right->max_free == root->max_free when \ * y->max_free < root->max_free || root->max_free == 0. \ * Otherwise, look right to find it. \ */ \ y = root->left; \ max_free = root->max_free; \ KASSERT(max_free == vm_size_max( \ vm_map_entry_max_free_left(root, llist), \ vm_map_entry_max_free_right(root, rlist)), \ ("%s: max_free invariant fails", __func__)); \ if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \ max_free = vm_map_entry_max_free_right(root, rlist); \ if (y != llist && (test)) { \ /* Rotate right and make y root. */ \ z = y->right; \ if (z != root) { \ root->left = z; \ y->right = root; \ if (max_free < y->max_free) \ root->max_free = max_free = \ vm_size_max(max_free, z->max_free); \ } else if (max_free < y->max_free) \ root->max_free = max_free = \ vm_size_max(max_free, root->start - y->end);\ root = y; \ y = root->left; \ } \ /* Copy right->max_free. Put root on rlist. */ \ root->max_free = max_free; \ KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \ ("%s: max_free not copied from right", __func__)); \ root->left = rlist; \ rlist = root; \ root = y != llist ? y : NULL; \ } while (0) #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \ vm_map_entry_t z; \ vm_size_t max_free; \ \ /* \ * Infer root->left->max_free == root->max_free when \ * y->max_free < root->max_free || root->max_free == 0. \ * Otherwise, look left to find it. \ */ \ y = root->right; \ max_free = root->max_free; \ KASSERT(max_free == vm_size_max( \ vm_map_entry_max_free_left(root, llist), \ vm_map_entry_max_free_right(root, rlist)), \ ("%s: max_free invariant fails", __func__)); \ if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \ max_free = vm_map_entry_max_free_left(root, llist); \ if (y != rlist && (test)) { \ /* Rotate left and make y root. */ \ z = y->left; \ if (z != root) { \ root->right = z; \ y->left = root; \ if (max_free < y->max_free) \ root->max_free = max_free = \ vm_size_max(max_free, z->max_free); \ } else if (max_free < y->max_free) \ root->max_free = max_free = \ vm_size_max(max_free, y->start - root->end);\ root = y; \ y = root->right; \ } \ /* Copy left->max_free. Put root on llist. */ \ root->max_free = max_free; \ KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \ ("%s: max_free not copied from left", __func__)); \ root->right = llist; \ llist = root; \ root = y != rlist ? y : NULL; \ } while (0) /* * Walk down the tree until we find addr or a gap where addr would go, breaking * off left and right subtrees of nodes less than, or greater than addr. Treat * subtrees with root->max_free < length as empty trees. llist and rlist are * the two sides in reverse order (bottom-up), with llist linked by the right * pointer and rlist linked by the left pointer in the vm_map_entry, and both * lists terminated by &map->header. This function, and the subsequent call to * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address * values in &map->header. */ static __always_inline vm_map_entry_t vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length, vm_map_entry_t *llist, vm_map_entry_t *rlist) { vm_map_entry_t left, right, root, y; left = right = &map->header; root = map->root; while (root != NULL && root->max_free >= length) { KASSERT(left->end <= root->start && root->end <= right->start, ("%s: root not within tree bounds", __func__)); if (addr < root->start) { SPLAY_LEFT_STEP(root, y, left, right, y->max_free >= length && addr < y->start); } else if (addr >= root->end) { SPLAY_RIGHT_STEP(root, y, left, right, y->max_free >= length && addr >= y->end); } else break; } *llist = left; *rlist = right; return (root); } static __always_inline void vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist) { vm_map_entry_t hi, right, y; right = *rlist; hi = root->right == right ? NULL : root->right; if (hi == NULL) return; do SPLAY_LEFT_STEP(hi, y, root, right, true); while (hi != NULL); *rlist = right; } static __always_inline void vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist) { vm_map_entry_t left, lo, y; left = *llist; lo = root->left == left ? NULL : root->left; if (lo == NULL) return; do SPLAY_RIGHT_STEP(lo, y, left, root, true); while (lo != NULL); *llist = left; } static inline void vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b) { vm_map_entry_t tmp; tmp = *b; *b = *a; *a = tmp; } /* * Walk back up the two spines, flip the pointers and set max_free. The * subtrees of the root go at the bottom of llist and rlist. */ static vm_size_t vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root, vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist) { do { /* * The max_free values of the children of llist are in * llist->max_free and max_free. Update with the * max value. */ llist->max_free = max_free = vm_size_max(llist->max_free, max_free); vm_map_entry_swap(&llist->right, &tail); vm_map_entry_swap(&tail, &llist); } while (llist != header); root->left = tail; return (max_free); } /* * When llist is known to be the predecessor of root. */ static inline vm_size_t vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root, vm_map_entry_t llist) { vm_size_t max_free; max_free = root->start - llist->end; if (llist != header) { max_free = vm_map_splay_merge_left_walk(header, root, root, max_free, llist); } else { root->left = header; header->right = root; } return (max_free); } /* * When llist may or may not be the predecessor of root. */ static inline vm_size_t vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root, vm_map_entry_t llist) { vm_size_t max_free; max_free = vm_map_entry_max_free_left(root, llist); if (llist != header) { max_free = vm_map_splay_merge_left_walk(header, root, root->left == llist ? root : root->left, max_free, llist); } return (max_free); } static vm_size_t vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root, vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist) { do { /* * The max_free values of the children of rlist are in * rlist->max_free and max_free. Update with the * max value. */ rlist->max_free = max_free = vm_size_max(rlist->max_free, max_free); vm_map_entry_swap(&rlist->left, &tail); vm_map_entry_swap(&tail, &rlist); } while (rlist != header); root->right = tail; return (max_free); } /* * When rlist is known to be the succecessor of root. */ static inline vm_size_t vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root, vm_map_entry_t rlist) { vm_size_t max_free; max_free = rlist->start - root->end; if (rlist != header) { max_free = vm_map_splay_merge_right_walk(header, root, root, max_free, rlist); } else { root->right = header; header->left = root; } return (max_free); } /* * When rlist may or may not be the succecessor of root. */ static inline vm_size_t vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root, vm_map_entry_t rlist) { vm_size_t max_free; max_free = vm_map_entry_max_free_right(root, rlist); if (rlist != header) { max_free = vm_map_splay_merge_right_walk(header, root, root->right == rlist ? root : root->right, max_free, rlist); } return (max_free); } /* * vm_map_splay: * * The Sleator and Tarjan top-down splay algorithm with the * following variation. Max_free must be computed bottom-up, so * on the downward pass, maintain the left and right spines in * reverse order. Then, make a second pass up each side to fix * the pointers and compute max_free. The time bound is O(log n) * amortized. * * The tree is threaded, which means that there are no null pointers. * When a node has no left child, its left pointer points to its * predecessor, which the last ancestor on the search path from the root * where the search branched right. Likewise, when a node has no right * child, its right pointer points to its successor. The map header node * is the predecessor of the first map entry, and the successor of the * last. * * The new root is the vm_map_entry containing "addr", or else an * adjacent entry (lower if possible) if addr is not in the tree. * * The map must be locked, and leaves it so. * * Returns: the new root. */ static vm_map_entry_t vm_map_splay(vm_map_t map, vm_offset_t addr) { vm_map_entry_t header, llist, rlist, root; vm_size_t max_free_left, max_free_right; header = &map->header; root = vm_map_splay_split(map, addr, 0, &llist, &rlist); if (root != NULL) { max_free_left = vm_map_splay_merge_left(header, root, llist); max_free_right = vm_map_splay_merge_right(header, root, rlist); } else if (llist != header) { /* * Recover the greatest node in the left * subtree and make it the root. */ root = llist; llist = root->right; max_free_left = vm_map_splay_merge_left(header, root, llist); max_free_right = vm_map_splay_merge_succ(header, root, rlist); } else if (rlist != header) { /* * Recover the least node in the right * subtree and make it the root. */ root = rlist; rlist = root->left; max_free_left = vm_map_splay_merge_pred(header, root, llist); max_free_right = vm_map_splay_merge_right(header, root, rlist); } else { /* There is no root. */ return (NULL); } root->max_free = vm_size_max(max_free_left, max_free_right); map->root = root; VM_MAP_ASSERT_CONSISTENT(map); return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. On linking, if new entry clips * existing entry, trim existing entry to avoid overlap, and manage * offsets. On unlinking, merge disappearing entry with neighbor, if * called for, and manage offsets. Callers should not modify fields in * entries already mapped. */ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t header, llist, rlist, root; vm_size_t max_free_left, max_free_right; CTR3(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p", map, map->nentries, entry); VM_MAP_ASSERT_LOCKED(map); map->nentries++; header = &map->header; root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); if (root == NULL) { /* * The new entry does not overlap any existing entry in the * map, so it becomes the new root of the map tree. */ max_free_left = vm_map_splay_merge_pred(header, entry, llist); max_free_right = vm_map_splay_merge_succ(header, entry, rlist); } else if (entry->start == root->start) { /* * The new entry is a clone of root, with only the end field * changed. The root entry will be shrunk to abut the new * entry, and will be the right child of the new root entry in * the modified map. */ KASSERT(entry->end < root->end, ("%s: clip_start not within entry", __func__)); vm_map_splay_findprev(root, &llist); root->offset += entry->end - root->start; root->start = entry->end; max_free_left = vm_map_splay_merge_pred(header, entry, llist); max_free_right = root->max_free = vm_size_max( vm_map_splay_merge_pred(entry, root, entry), vm_map_splay_merge_right(header, root, rlist)); } else { /* * The new entry is a clone of root, with only the start field * changed. The root entry will be shrunk to abut the new * entry, and will be the left child of the new root entry in * the modified map. */ KASSERT(entry->end == root->end, ("%s: clip_start not within entry", __func__)); vm_map_splay_findnext(root, &rlist); entry->offset += entry->start - root->start; root->end = entry->start; max_free_left = root->max_free = vm_size_max( vm_map_splay_merge_left(header, root, llist), vm_map_splay_merge_succ(entry, root, entry)); max_free_right = vm_map_splay_merge_succ(header, entry, rlist); } entry->max_free = vm_size_max(max_free_left, max_free_right); map->root = entry; VM_MAP_ASSERT_CONSISTENT(map); } enum unlink_merge_type { UNLINK_MERGE_NONE, UNLINK_MERGE_NEXT }; static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry, enum unlink_merge_type op) { vm_map_entry_t header, llist, rlist, root; vm_size_t max_free_left, max_free_right; VM_MAP_ASSERT_LOCKED(map); header = &map->header; root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root != NULL, ("vm_map_entry_unlink: unlink object not mapped")); vm_map_splay_findprev(root, &llist); vm_map_splay_findnext(root, &rlist); if (op == UNLINK_MERGE_NEXT) { rlist->start = root->start; rlist->offset = root->offset; } if (llist != header) { root = llist; llist = root->right; max_free_left = vm_map_splay_merge_left(header, root, llist); max_free_right = vm_map_splay_merge_succ(header, root, rlist); } else if (rlist != header) { root = rlist; rlist = root->left; max_free_left = vm_map_splay_merge_pred(header, root, llist); max_free_right = vm_map_splay_merge_right(header, root, rlist); } else { header->left = header->right = header; root = NULL; } if (root != NULL) root->max_free = vm_size_max(max_free_left, max_free_right); map->root = root; VM_MAP_ASSERT_CONSISTENT(map); map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_entry_resize: * * Resize a vm_map_entry, recompute the amount of free space that * follows it and propagate that value up the tree. * * The map must be locked, and leaves it so. */ static void vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount) { vm_map_entry_t header, llist, rlist, root; VM_MAP_ASSERT_LOCKED(map); header = &map->header; root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root != NULL, ("%s: resize object not mapped", __func__)); vm_map_splay_findnext(root, &rlist); entry->end += grow_amount; root->max_free = vm_size_max( vm_map_splay_merge_left(header, root, llist), vm_map_splay_merge_succ(header, root, rlist)); map->root = root; VM_MAP_ASSERT_CONSISTENT(map); CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p", __func__, map, map->nentries, entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur, header, lbound, ubound; boolean_t locked; /* * If the map is empty, then the map entry immediately preceding * "address" is the map's header. */ header = &map->header; cur = map->root; if (cur == NULL) { *entry = header; return (FALSE); } if (address >= cur->start && cur->end > address) { *entry = cur; return (TRUE); } if ((locked = vm_map_locked(map)) || sx_try_upgrade(&map->lock)) { /* * Splay requires a write lock on the map. However, it only * restructures the binary search tree; it does not otherwise * change the map. Thus, the map's timestamp need not change * on a temporary upgrade. */ cur = vm_map_splay(map, address); if (!locked) { VM_MAP_UNLOCK_CONSISTENT(map); sx_downgrade(&map->lock); } /* * If "address" is contained within a map entry, the new root * is that map entry. Otherwise, the new root is a map entry * immediately before or after "address". */ if (address < cur->start) { *entry = header; return (FALSE); } *entry = cur; return (address < cur->end); } /* * Since the map is only locked for read access, perform a * standard binary search tree lookup for "address". */ lbound = ubound = header; for (;;) { if (address < cur->start) { ubound = cur; cur = cur->left; if (cur == lbound) break; } else if (cur->end <= address) { lbound = cur; cur = cur->right; if (cur == ubound) break; } else { *entry = cur; return (TRUE); } } *entry = lbound; return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, next_entry, prev_entry; struct ucred *cred; vm_eflags_t protoeflags; vm_inherit_t inheritance; u_long bdry; u_int bidx; VM_MAP_ASSERT_LOCKED(map); KASSERT(object != kernel_object || (cow & MAP_COPY_ON_WRITE) == 0, ("vm_map_insert: kernel object and COW")); KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 || (cow & MAP_SPLIT_BOUNDARY_MASK) != 0, ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x", object, cow)); KASSERT((prot & ~max) == 0, ("prot %#x is not subset of max_prot %#x", prot, max)); /* * Check that the start and end points are not bogus. */ if (start == end || !vm_map_range_valid(map, start, end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &prev_entry)) return (KERN_NO_SPACE); /* * Assert that the next entry doesn't overlap the end point. */ next_entry = vm_map_entry_succ(prev_entry); if (next_entry->start < end) return (KERN_NO_SPACE); if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL || max != VM_PROT_NONE)) return (KERN_INVALID_ARGUMENT); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) protoeflags |= MAP_ENTRY_NOFAULT; if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (cow & MAP_STACK_GROWS_DOWN) protoeflags |= MAP_ENTRY_GROWS_DOWN; if (cow & MAP_STACK_GROWS_UP) protoeflags |= MAP_ENTRY_GROWS_UP; if (cow & MAP_WRITECOUNT) protoeflags |= MAP_ENTRY_WRITECNT; if (cow & MAP_VN_EXEC) protoeflags |= MAP_ENTRY_VN_EXEC; if ((cow & MAP_CREATE_GUARD) != 0) protoeflags |= MAP_ENTRY_GUARD; if ((cow & MAP_CREATE_STACK_GAP_DN) != 0) protoeflags |= MAP_ENTRY_STACK_GAP_DN; if ((cow & MAP_CREATE_STACK_GAP_UP) != 0) protoeflags |= MAP_ENTRY_STACK_GAP_UP; if (cow & MAP_INHERIT_SHARE) inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) { /* This magically ignores index 0, for usual page size. */ bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >> MAP_SPLIT_BOUNDARY_SHIFT; if (bidx >= MAXPAGESIZES) return (KERN_INVALID_ARGUMENT); bdry = pagesizes[bidx] - 1; if ((start & bdry) != 0 || (end & bdry) != 0) return (KERN_INVALID_ARGUMENT); protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; } cred = NULL; if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) goto charged; if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) && ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) { if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start)) return (KERN_RESOURCE_SHORTAGE); KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 || object->cred == NULL, ("overcommit: vm_map_insert o %p", object)); cred = curthread->td_ucred; } charged: /* Expand the kernel pmap, if necessary. */ if (map == kernel_map && end > kernel_vm_end) pmap_growkernel(end); if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ if ((object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(object); if (object->ref_count > 1 || object->shadow_count != 0) vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); } } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) == protoeflags && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP | MAP_VN_EXEC)) == 0 && prev_entry->end == start && (prev_entry->cred == cred || (prev_entry->object.vm_object != NULL && prev_entry->object.vm_object->cred == cred)) && vm_object_coalesce(prev_entry->object.vm_object, prev_entry->offset, (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end), cred != NULL && (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if (prev_entry->inheritance == inheritance && prev_entry->protection == prot && prev_entry->max_protection == max && prev_entry->wired_count == 0) { KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) == 0, ("prev_entry %p has incoherent wiring", prev_entry)); if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0) map->size += end - prev_entry->end; vm_map_entry_resize(map, prev_entry, end - prev_entry->end); vm_map_try_merge_entries(map, prev_entry, next_entry); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); if (cred != NULL && object != NULL && object->cred != NULL && !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { /* Object already accounts for this uid. */ cred = NULL; } } if (cred != NULL) crhold(cred); /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->cred = NULL; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->inheritance = inheritance; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; new_entry->wiring_thread = NULL; new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT; new_entry->next_read = start; KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry), ("overcommit: vm_map_insert leaks vm_map %p", new_entry)); new_entry->cred = cred; /* * Insert the new entry into the list */ vm_map_entry_link(map, new_entry); if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0) map->size += new_entry->end - new_entry->start; /* * Try to coalesce the new entry with both the previous and next * entries in the list. Previously, we only attempted to coalesce * with the previous entry when object is NULL. Here, we handle the * other cases, which are less common. */ vm_map_try_merge_entries(map, prev_entry, new_entry); vm_map_try_merge_entries(map, new_entry, next_entry); if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) { vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * vm_map_findspace: * * Find the first fit (lowest VM address) for "length" free bytes * beginning at address >= start in the given map. * * In a vm_map_entry, "max_free" is the maximum amount of * contiguous free space between an entry in its subtree and a * neighbor of that entry. This allows finding a free region in * one path down the tree, so O(log n) amortized with splay * trees. * * The map must be locked, and leaves it so. * * Returns: starting address if sufficient space, * vm_map_max(map)-length+1 if insufficient space. */ vm_offset_t vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length) { vm_map_entry_t header, llist, rlist, root, y; vm_size_t left_length, max_free_left, max_free_right; vm_offset_t gap_end; /* * Request must fit within min/max VM address and must avoid * address wrap. */ start = MAX(start, vm_map_min(map)); if (start >= vm_map_max(map) || length > vm_map_max(map) - start) return (vm_map_max(map) - length + 1); /* Empty tree means wide open address space. */ if (map->root == NULL) return (start); /* * After splay_split, if start is within an entry, push it to the start * of the following gap. If rlist is at the end of the gap containing * start, save the end of that gap in gap_end to see if the gap is big * enough; otherwise set gap_end to start skip gap-checking and move * directly to a search of the right subtree. */ header = &map->header; root = vm_map_splay_split(map, start, length, &llist, &rlist); gap_end = rlist->start; if (root != NULL) { start = root->end; if (root->right != rlist) gap_end = start; max_free_left = vm_map_splay_merge_left(header, root, llist); max_free_right = vm_map_splay_merge_right(header, root, rlist); } else if (rlist != header) { root = rlist; rlist = root->left; max_free_left = vm_map_splay_merge_pred(header, root, llist); max_free_right = vm_map_splay_merge_right(header, root, rlist); } else { root = llist; llist = root->right; max_free_left = vm_map_splay_merge_left(header, root, llist); max_free_right = vm_map_splay_merge_succ(header, root, rlist); } root->max_free = vm_size_max(max_free_left, max_free_right); map->root = root; VM_MAP_ASSERT_CONSISTENT(map); if (length <= gap_end - start) return (start); /* With max_free, can immediately tell if no solution. */ if (root->right == header || length > root->right->max_free) return (vm_map_max(map) - length + 1); /* * Splay for the least large-enough gap in the right subtree. */ llist = rlist = header; for (left_length = 0;; left_length = vm_map_entry_max_free_left(root, llist)) { if (length <= left_length) SPLAY_LEFT_STEP(root, y, llist, rlist, length <= vm_map_entry_max_free_left(y, llist)); else SPLAY_RIGHT_STEP(root, y, llist, rlist, length > vm_map_entry_max_free_left(y, root)); if (root == NULL) break; } root = llist; llist = root->right; max_free_left = vm_map_splay_merge_left(header, root, llist); if (rlist == header) { root->max_free = vm_size_max(max_free_left, vm_map_splay_merge_succ(header, root, rlist)); } else { y = rlist; rlist = y->left; y->max_free = vm_size_max( vm_map_splay_merge_pred(root, y, root), vm_map_splay_merge_right(header, y, rlist)); root->max_free = vm_size_max(max_free_left, y->max_free); } map->root = root; VM_MAP_ASSERT_CONSISTENT(map); return (root->end); } int vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_size_t length, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t end; int result; end = start + length; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_fixed: non-NULL backing object for stack")); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if ((cow & MAP_CHECK_EXCL) == 0) { result = vm_map_delete(map, start, end); if (result != KERN_SUCCESS) goto out; } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); } else { result = vm_map_insert(map, object, offset, start, end, prot, max, cow); } out: vm_map_unlock(map); return (result); } static const int aslr_pages_rnd_64[2] = {0x1000, 0x10}; static const int aslr_pages_rnd_32[2] = {0x100, 0x4}; static int cluster_anon = 1; SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW, &cluster_anon, 0, "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always"); static bool clustering_anon_allowed(vm_offset_t addr) { switch (cluster_anon) { case 0: return (false); case 1: return (addr == 0); case 2: default: return (true); } } static long aslr_restarts; SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD, &aslr_restarts, 0, "Number of aslr failures"); /* * Searches for the specified amount of free space in the given map with the * specified alignment. Performs an address-ordered, first-fit search from * the given address "*addr", with an optional upper bound "max_addr". If the * parameter "alignment" is zero, then the alignment is computed from the * given (object, offset) pair so as to enable the greatest possible use of * superpage mappings. Returns KERN_SUCCESS and the address of the free space * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE. * * The map must be locked. Initially, there must be at least "length" bytes * of free space at the given address. */ static int vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr, vm_offset_t alignment) { vm_offset_t aligned_addr, free_addr; VM_MAP_ASSERT_LOCKED(map); free_addr = *addr; KASSERT(free_addr == vm_map_findspace(map, free_addr, length), ("caller failed to provide space %#jx at address %p", (uintmax_t)length, (void *)free_addr)); for (;;) { /* * At the start of every iteration, the free space at address * "*addr" is at least "length" bytes. */ if (alignment == 0) pmap_align_superpage(object, offset, addr, length); else if ((*addr & (alignment - 1)) != 0) { *addr &= ~(alignment - 1); *addr += alignment; } aligned_addr = *addr; if (aligned_addr == free_addr) { /* * Alignment did not change "*addr", so "*addr" must * still provide sufficient free space. */ return (KERN_SUCCESS); } /* * Test for address wrap on "*addr". A wrapped "*addr" could * be a valid address, in which case vm_map_findspace() cannot * be relied upon to fail. */ if (aligned_addr < free_addr) return (KERN_NO_SPACE); *addr = vm_map_findspace(map, aligned_addr, length); if (*addr + length > vm_map_max(map) || (max_addr != 0 && *addr + length > max_addr)) return (KERN_NO_SPACE); free_addr = *addr; if (free_addr == aligned_addr) { /* * If a successful call to vm_map_findspace() did not * change "*addr", then "*addr" must still be aligned * and provide sufficient free space. */ return (KERN_SUCCESS); } } } int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr, vm_offset_t alignment) { /* XXXKIB ASLR eh ? */ *addr = vm_map_findspace(map, *addr, length); if (*addr + length > vm_map_max(map) || (max_addr != 0 && *addr + length > max_addr)) return (KERN_NO_SPACE); return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr, alignment)); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t alignment, curr_min_addr, min_addr; int gap, pidx, rv, try; bool cluster, en_aslr, update_anon; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_find: non-NULL backing object for stack")); MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)); if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL || (object->flags & OBJ_COLORED) == 0)) find_space = VMFS_ANY_SPACE; if (find_space >> 8 != 0) { KASSERT((find_space & 0xff) == 0, ("bad VMFS flags")); alignment = (vm_offset_t)1 << (find_space >> 8); } else alignment = 0; en_aslr = (map->flags & MAP_ASLR) != 0; update_anon = cluster = clustering_anon_allowed(*addr) && (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 && find_space != VMFS_NO_SPACE && object == NULL && (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP | MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE; curr_min_addr = min_addr = *addr; if (en_aslr && min_addr == 0 && !cluster && find_space != VMFS_NO_SPACE && (map->flags & MAP_ASLR_IGNSTART) != 0) curr_min_addr = min_addr = vm_map_min(map); try = 0; vm_map_lock(map); if (cluster) { curr_min_addr = map->anon_loc; if (curr_min_addr == 0) cluster = false; } if (find_space != VMFS_NO_SPACE) { KASSERT(find_space == VMFS_ANY_SPACE || find_space == VMFS_OPTIMAL_SPACE || find_space == VMFS_SUPER_SPACE || alignment != 0, ("unexpected VMFS flag")); again: /* * When creating an anonymous mapping, try clustering * with an existing anonymous mapping first. * * We make up to two attempts to find address space * for a given find_space value. The first attempt may * apply randomization or may cluster with an existing * anonymous mapping. If this first attempt fails, * perform a first-fit search of the available address * space. * * If all tries failed, and find_space is * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE. * Again enable clustering and randomization. */ try++; MPASS(try <= 2); if (try == 2) { /* * Second try: we failed either to find a * suitable region for randomizing the * allocation, or to cluster with an existing * mapping. Retry with free run. */ curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ? vm_map_min(map) : min_addr; atomic_add_long(&aslr_restarts, 1); } if (try == 1 && en_aslr && !cluster) { /* * Find space for allocation, including * gap needed for later randomization. */ pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 && (find_space == VMFS_SUPER_SPACE || find_space == VMFS_OPTIMAL_SPACE) ? 1 : 0; gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR && (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ? aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx]; *addr = vm_map_findspace(map, curr_min_addr, length + gap * pagesizes[pidx]); if (*addr + length + gap * pagesizes[pidx] > vm_map_max(map)) goto again; /* And randomize the start address. */ *addr += (arc4random() % gap) * pagesizes[pidx]; if (max_addr != 0 && *addr + length > max_addr) goto again; } else { *addr = vm_map_findspace(map, curr_min_addr, length); if (*addr + length > vm_map_max(map) || (max_addr != 0 && *addr + length > max_addr)) { if (cluster) { cluster = false; MPASS(try == 1); goto again; } rv = KERN_NO_SPACE; goto done; } } if (find_space != VMFS_ANY_SPACE && (rv = vm_map_alignspace(map, object, offset, addr, length, max_addr, alignment)) != KERN_SUCCESS) { if (find_space == VMFS_OPTIMAL_SPACE) { find_space = VMFS_ANY_SPACE; curr_min_addr = min_addr; cluster = update_anon; try = 0; goto again; } goto done; } } else if ((cow & MAP_REMAP) != 0) { if (!vm_map_range_valid(map, *addr, *addr + length)) { rv = KERN_INVALID_ADDRESS; goto done; } rv = vm_map_delete(map, *addr, *addr + length); if (rv != KERN_SUCCESS) goto done; } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot, max, cow); } else { rv = vm_map_insert(map, object, offset, *addr, *addr + length, prot, max, cow); } if (rv == KERN_SUCCESS && update_anon) map->anon_loc = *addr + length; done: vm_map_unlock(map); return (rv); } /* * vm_map_find_min() is a variant of vm_map_find() that takes an * additional parameter (min_addr) and treats the given address * (*addr) differently. Specifically, it treats *addr as a hint * and not as the minimum address where the mapping is created. * * This function works in two phases. First, it tries to * allocate above the hint. If that fails and the hint is * greater than min_addr, it performs a second pass, replacing * the hint with min_addr as the minimum address for the * allocation. */ int vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t hint; int rv; hint = *addr; for (;;) { rv = vm_map_find(map, object, offset, addr, length, max_addr, find_space, prot, max, cow); if (rv == KERN_SUCCESS || min_addr >= hint) return (rv); *addr = hint = min_addr; } } /* * A map entry with any of the following flags set must not be merged with * another entry. */ #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \ MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC) static bool vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry) { KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 || (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0, ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable", prev, entry)); return (prev->end == entry->start && prev->object.vm_object == entry->object.vm_object && (prev->object.vm_object == NULL || prev->offset + (prev->end - prev->start) == entry->offset) && prev->eflags == entry->eflags && prev->protection == entry->protection && prev->max_protection == entry->max_protection && prev->inheritance == entry->inheritance && prev->wired_count == entry->wired_count && prev->cred == entry->cred); } static void vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry) { /* * If the backing object is a vnode object, vm_object_deallocate() * calls vrele(). However, vrele() does not lock the vnode because * the vnode has additional references. Thus, the map lock can be * kept without causing a lock-order reversal with the vnode lock. * * Since we count the number of virtual page mappings in * object->un_pager.vnp.writemappings, the writemappings value * should not be adjusted when the entry is disposed of. */ if (entry->object.vm_object != NULL) vm_object_deallocate(entry->object.vm_object); if (entry->cred != NULL) crfree(entry->cred); vm_map_entry_dispose(map, entry); } /* * vm_map_try_merge_entries: * * Compare the given map entry to its predecessor, and merge its precessor * into it if possible. The entry remains valid, and may be extended. * The predecessor may be deleted. * * The map must be locked. */ void vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 && vm_map_mergeable_neighbors(prev_entry, entry)) { vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT); vm_map_merged_neighbor_dispose(map, prev_entry); } } /* * vm_map_entry_back: * * Allocate an object to back a map entry. */ static inline void vm_map_entry_back(vm_map_entry_t entry) { vm_object_t object; KASSERT(entry->object.vm_object == NULL, ("map entry %p has backing object", entry)); KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("map entry %p is a submap", entry)); object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL, entry->cred, entry->end - entry->start); entry->object.vm_object = object; entry->offset = 0; entry->cred = NULL; } /* * vm_map_entry_charge_object * * If there is no object backing this entry, create one. Otherwise, if * the entry has cred, give it to the backing object. */ static inline void vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("map entry %p is a submap", entry)); if (entry->object.vm_object == NULL && !map->system_map && (entry->eflags & MAP_ENTRY_GUARD) == 0) vm_map_entry_back(entry); else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: %s: both cred e %p", __func__, entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } } /* * vm_map_entry_clone * * Create a duplicate map entry for clipping. */ static vm_map_entry_t vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); /* * Create a backing object now, if none exists, so that more individual * objects won't be created after the map entry is split. */ vm_map_entry_charge_object(map, entry); /* Clone the entry. */ new_entry = vm_map_entry_create(map); *new_entry = *entry; if (new_entry->cred != NULL) crhold(entry->cred); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); vm_map_entry_set_vnode_text(new_entry, true); /* * The object->un_pager.vnp.writemappings for the object of * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The * virtual pages are re-distributed among the clipped entries, * so the sum is left the same. */ } return (new_entry); } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ static int vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr) { vm_map_entry_t new_entry; int bdry_idx; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p entry %p start 0x%jx", __func__, map, entry, (uintmax_t)startaddr); if (startaddr <= entry->start) return (KERN_SUCCESS); VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->end > startaddr && entry->start < startaddr, ("%s: invalid clip of entry %p", __func__, entry)); bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; if (bdry_idx != 0) { if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0) return (KERN_INVALID_ARGUMENT); } new_entry = vm_map_entry_clone(map, entry); /* * Split off the front portion. Insert the new entry BEFORE this one, * so that this entry has the specified starting address. */ new_entry->end = startaddr; vm_map_entry_link(map, new_entry); return (KERN_SUCCESS); } /* * vm_map_lookup_clip_start: * * Find the entry at or just after 'start', and clip it if 'start' is in * the interior of the entry. Return entry after 'start', and in * prev_entry set the entry before 'start'. */ static int vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start, vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry) { vm_map_entry_t entry; int rv; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p start 0x%jx prev %p", __func__, map, (uintmax_t)start, prev_entry); if (vm_map_lookup_entry(map, start, prev_entry)) { entry = *prev_entry; rv = vm_map_clip_start(map, entry, start); if (rv != KERN_SUCCESS) return (rv); *prev_entry = vm_map_entry_pred(entry); } else entry = vm_map_entry_succ(*prev_entry); *res_entry = entry; return (KERN_SUCCESS); } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ static int vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr) { vm_map_entry_t new_entry; int bdry_idx; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p entry %p end 0x%jx", __func__, map, entry, (uintmax_t)endaddr); if (endaddr >= entry->end) return (KERN_SUCCESS); VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->start < endaddr && entry->end > endaddr, ("%s: invalid clip of entry %p", __func__, entry)); bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; if (bdry_idx != 0) { if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0) return (KERN_INVALID_ARGUMENT); } new_entry = vm_map_entry_clone(map, entry); /* * Split off the back portion. Insert the new entry AFTER this one, * so that this entry has the specified ending address. */ new_entry->start = endaddr; vm_map_entry_link(map, new_entry); return (KERN_SUCCESS); } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result; result = KERN_INVALID_ARGUMENT; vm_map_lock(submap); submap->flags |= MAP_IS_SUB_MAP; vm_map_unlock(submap); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end && (entry->eflags & MAP_ENTRY_COW) == 0 && entry->object.vm_object == NULL) { result = vm_map_clip_start(map, entry, start); if (result != KERN_SUCCESS) goto unlock; result = vm_map_clip_end(map, entry, end); if (result != KERN_SUCCESS) goto unlock; entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } unlock: vm_map_unlock(map); if (result != KERN_SUCCESS) { vm_map_lock(submap); submap->flags &= ~MAP_IS_SUB_MAP; vm_map_unlock(submap); } return (result); } /* * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload the specified map's pmap with mappings to the specified * object's memory-resident pages. No further physical pages are * allocated, and no further virtual pages are retrieved from secondary * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a * limited number of page mappings are created at the low-end of the * specified address range. (For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within * the specified address range are mapped. */ static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t start; vm_page_t p, p_start; vm_pindex_t mask, psize, threshold, tmpidx; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_LOCK_DOWNGRADE(object); } else VM_OBJECT_RLOCK(object); psize = atop(size); if (psize + pindex > object->size) { if (pindex >= object->size) { VM_OBJECT_RUNLOCK(object); return; } psize = object->size - pindex; } start = 0; p_start = NULL; threshold = MAX_INIT_PT; p = vm_page_find_least(object, pindex); /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if (((flags & MAP_PREFAULT_MADVISE) != 0 && vm_page_count_severe()) || ((flags & MAP_PREFAULT_PARTIAL) != 0 && tmpidx >= threshold)) { psize = tmpidx; break; } if (vm_page_all_valid(p)) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; } /* Jump ahead if a superpage mapping is possible. */ if (p->psind > 0 && ((addr + ptoa(tmpidx)) & (pagesizes[p->psind] - 1)) == 0) { mask = atop(pagesizes[p->psind]) - 1; if (tmpidx + mask < psize && vm_page_ps_test(p, PS_ALL_VALID, NULL)) { p += mask; threshold += mask; } } } else if (p_start != NULL) { pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } if (p_start != NULL) pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); VM_OBJECT_RUNLOCK(object); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t entry, first_entry, in_tran, prev_entry; vm_object_t obj; struct ucred *cred; vm_prot_t old_prot; int rv; if (start == end) return (KERN_SUCCESS); again: in_tran = NULL; vm_map_lock(map); /* * Ensure that we are not concurrently wiring pages. vm_map_wire() may * need to fault pages into the map and will drop the map lock while * doing so, and the VM object may end up in an inconsistent state if we * update the protection on the map entry in between faults. */ vm_map_wait_busy(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) first_entry = vm_map_entry_succ(first_entry); /* * Make a first pass to check for protection violations. */ for (entry = first_entry; entry->start < end; entry = vm_map_entry_succ(entry)) { if ((entry->eflags & MAP_ENTRY_GUARD) != 0) continue; if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & entry->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) in_tran = entry; } /* * Postpone the operation until all in-transition map entries have * stabilized. An in-transition entry might already have its pages * wired and wired_count incremented, but not yet have its * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call * vm_fault_copy_entry() in the final loop below. */ if (in_tran != NULL) { in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(map, 0); goto again; } /* * Before changing the protections, try to reserve swap space for any * private (i.e., copy-on-write) mappings that are transitioning from * read-only to read/write access. If a reservation fails, break out * of this loop early and let the next loop simplify the entries, since * some may now be mergeable. */ rv = vm_map_clip_start(map, first_entry, start); if (rv != KERN_SUCCESS) { vm_map_unlock(map); return (rv); } for (entry = first_entry; entry->start < end; entry = vm_map_entry_succ(entry)) { rv = vm_map_clip_end(map, entry, end); if (rv != KERN_SUCCESS) { vm_map_unlock(map); return (rv); } if (set_max || ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 || ENTRY_CHARGED(entry) || (entry->eflags & MAP_ENTRY_GUARD) != 0) { continue; } cred = curthread->td_ucred; obj = entry->object.vm_object; if (obj == NULL || (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) { if (!swap_reserve(entry->end - entry->start)) { rv = KERN_RESOURCE_SHORTAGE; end = entry->end; break; } crhold(cred); entry->cred = cred; continue; } if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) continue; VM_OBJECT_WLOCK(obj); if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { VM_OBJECT_WUNLOCK(obj); continue; } /* * Charge for the whole object allocation now, since * we cannot distinguish between non-charged and * charged clipped mapping of the same object later. */ KASSERT(obj->charge == 0, ("vm_map_protect: object %p overcharged (entry %p)", obj, entry)); if (!swap_reserve(ptoa(obj->size))) { VM_OBJECT_WUNLOCK(obj); rv = KERN_RESOURCE_SHORTAGE; end = entry->end; break; } crhold(cred); obj->cred = cred; obj->charge = ptoa(obj->size); VM_OBJECT_WUNLOCK(obj); } /* * If enough swap space was available, go back and fix up protections. * Otherwise, just simplify entries, since some may have been modified. * [Note that clipping is not necessary the second time.] */ for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry; entry->start < end; vm_map_try_merge_entries(map, prev_entry, entry), prev_entry = entry, entry = vm_map_entry_succ(entry)) { if (rv != KERN_SUCCESS || (entry->eflags & MAP_ENTRY_GUARD) != 0) continue; old_prot = entry->protection; if (set_max) entry->protection = (entry->max_protection = new_prot) & old_prot; else entry->protection = new_prot; /* * For user wired map entries, the normal lazy evaluation of * write access upgrades through soft page faults is * undesirable. Instead, immediately copy any pages that are * copy-on-write and enable write access in the physical map. */ if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 && (entry->protection & VM_PROT_WRITE) != 0 && (old_prot & VM_PROT_WRITE) == 0) vm_fault_copy_entry(map, map, entry, entry, NULL); /* * When restricting access, update the physical map. Worry * about copy-on-write here. */ if ((old_prot & ~entry->protection) != 0) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, entry->start, entry->end, entry->protection & MASK(entry)); #undef MASK } } vm_map_try_merge_entries(map, prev_entry, entry); vm_map_unlock(map); return (rv); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t entry, prev_entry; int rv; bool modify_map; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: if (start == end) return (0); modify_map = true; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: if (start == end) return (0); modify_map = false; vm_map_lock_read(map); break; default: return (EINVAL); } /* * Locate starting entry and clip if necessary. */ VM_MAP_RANGE_CHECK(map, start, end); if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry); if (rv != KERN_SUCCESS) { vm_map_unlock(map); return (vm_mmap_to_errno(rv)); } for (; entry->start < end; prev_entry = entry, entry = vm_map_entry_succ(entry)) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; rv = vm_map_clip_end(map, entry, end); if (rv != KERN_SUCCESS) { vm_map_unlock(map); return (vm_mmap_to_errno(rv)); } switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(entry, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(entry, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(entry, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: entry->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: entry->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: entry->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: entry->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_try_merge_entries(map, prev_entry, entry); } vm_map_try_merge_entries(map, prev_entry, entry); vm_map_unlock(map); } else { vm_pindex_t pstart, pend; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ if (!vm_map_lookup_entry(map, start, &entry)) entry = vm_map_entry_succ(entry); for (; entry->start < end; entry = vm_map_entry_succ(entry)) { vm_offset_t useEnd, useStart; if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; /* * MADV_FREE would otherwise rewind time to * the creation of the shadow object. Because * we hold the VM map read-locked, neither the * entry's object nor the presence of a * backing object can change. */ if (behav == MADV_FREE && entry->object.vm_object != NULL && entry->object.vm_object->backing_object != NULL) continue; pstart = OFF_TO_IDX(entry->offset); pend = pstart + atop(entry->end - entry->start); useStart = entry->start; useEnd = entry->end; if (entry->start < start) { pstart += atop(start - entry->start); useStart = start; } if (entry->end > end) { pend -= atop(entry->end - end); useEnd = end; } if (pstart >= pend) continue; /* * Perform the pmap_advise() before clearing * PGA_REFERENCED in vm_page_advise(). Otherwise, a * concurrent pmap operation, such as pmap_remove(), * could clear a reference in the pmap and set * PGA_REFERENCED on the page before the pmap_advise() * had completed. Consequently, the page would appear * referenced based upon an old reference that * occurred before this pmap_advise() ran. */ if (behav == MADV_DONTNEED || behav == MADV_FREE) pmap_advise(map->pmap, useStart, useEnd, behav); vm_object_madvise(entry->object.vm_object, pstart, pend, behav); /* * Pre-populate paging structures in the * WILLNEED case. For wired entries, the * paging structures are already populated. */ if (behav == MADV_WILLNEED && entry->wired_count == 0) { vm_map_pmap_enter(map, useStart, entry->protection, entry->object.vm_object, pstart, ptoa(pend - pstart), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vmspace_fork. */ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry, lentry, prev_entry, start_entry; int rv; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: case VM_INHERIT_ZERO: break; default: return (KERN_INVALID_ARGUMENT); } if (start == end) return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry); if (rv != KERN_SUCCESS) goto unlock; if (vm_map_lookup_entry(map, end - 1, &lentry)) { rv = vm_map_clip_end(map, lentry, end); if (rv != KERN_SUCCESS) goto unlock; } if (new_inheritance == VM_INHERIT_COPY) { for (entry = start_entry; entry->start < end; prev_entry = entry, entry = vm_map_entry_succ(entry)) { if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) { rv = KERN_INVALID_ARGUMENT; goto unlock; } } } for (entry = start_entry; entry->start < end; prev_entry = entry, entry = vm_map_entry_succ(entry)) { KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx", entry, (uintmax_t)entry->end, (uintmax_t)end)); if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || new_inheritance != VM_INHERIT_ZERO) entry->inheritance = new_inheritance; vm_map_try_merge_entries(map, prev_entry, entry); } vm_map_try_merge_entries(map, prev_entry, entry); unlock: vm_map_unlock(map); return (rv); } /* * vm_map_entry_in_transition: * * Release the map lock, and sleep until the entry is no longer in * transition. Awake and acquire the map lock. If the map changed while * another held the lock, lookup a possibly-changed entry at or after the * 'start' position of the old entry. */ static vm_map_entry_t vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start, vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry) { vm_map_entry_t entry; vm_offset_t start; u_int last_timestamp; VM_MAP_ASSERT_LOCKED(map); KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("not in-tranition map entry %p", in_entry)); /* * We have not yet clipped the entry. */ start = MAX(in_start, in_entry->start); in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user wiring/unwiring? */ } vm_map_lock(map); if (last_timestamp + 1 == map->timestamp) return (in_entry); /* * Look again for the entry because the map was modified while it was * unlocked. Specifically, the entry may have been clipped, merged, or * deleted. */ if (!vm_map_lookup_entry(map, start, &entry)) { if (!holes_ok) { *io_end = start; return (NULL); } entry = vm_map_entry_succ(entry); } return (entry); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, next_entry, prev_entry; int rv; bool holes_ok, need_wakeup, user_unwire; if (start == end) return (KERN_SUCCESS); holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0; user_unwire = (flags & VM_MAP_WIRE_USER) != 0; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (holes_ok) first_entry = vm_map_entry_succ(first_entry); else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } rv = KERN_SUCCESS; for (entry = first_entry; entry->start < end; entry = next_entry) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ next_entry = vm_map_entry_in_transition(map, start, &end, holes_ok, entry); if (next_entry == NULL) { if (entry == first_entry) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } rv = KERN_INVALID_ADDRESS; break; } first_entry = (entry == first_entry) ? next_entry : NULL; continue; } rv = vm_map_clip_start(map, entry, start); if (rv != KERN_SUCCESS) break; rv = vm_map_clip_end(map, entry, end); if (rv != KERN_SUCCESS) break; /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; next_entry = vm_map_entry_succ(entry); /* * Check the map for holes in the specified region. * If holes_ok, skip this check. */ if (!holes_ok && entry->end < end && next_entry->start > entry->end) { end = entry->end; rv = KERN_INVALID_ADDRESS; break; } /* * If system unwiring, require that the entry is system wired. */ if (!user_unwire && vm_map_entry_system_wired_count(entry) == 0) { end = entry->end; rv = KERN_INVALID_ARGUMENT; break; } } need_wakeup = false; if (first_entry == NULL && !vm_map_lookup_entry(map, start, &first_entry)) { KASSERT(holes_ok, ("vm_map_unwire: lookup failed")); prev_entry = first_entry; entry = vm_map_entry_succ(first_entry); } else { prev_entry = vm_map_entry_pred(first_entry); entry = first_entry; } for (; entry->start < end; prev_entry = entry, entry = vm_map_entry_succ(entry)) { /* * If holes_ok was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for draining * MAP_ENTRY_IN_TRANSITION. Moreover, another thread * could be simultaneously wiring this new mapping * entry. Detect these cases and skip any entries * marked as in transition by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT(holes_ok, ("vm_map_unwire: !HOLESOK and new/changed entry")); continue; } if (rv == KERN_SUCCESS && (!user_unwire || (entry->eflags & MAP_ENTRY_USER_WIRED))) { if (entry->wired_count == 1) vm_map_entry_unwire(map, entry); else entry->wired_count--; if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; } KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_unwire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_unwire: alien wire %p", entry)); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = true; } vm_map_try_merge_entries(map, prev_entry, entry); } vm_map_try_merge_entries(map, prev_entry, entry); vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } static void vm_map_wire_user_count_sub(u_long npages) { atomic_subtract_long(&vm_user_wire_count, npages); } static bool vm_map_wire_user_count_add(u_long npages) { u_long wired; wired = vm_user_wire_count; do { if (npages + wired > vm_page_max_user_wired) return (false); } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired, npages + wired)); return (true); } /* * vm_map_wire_entry_failure: * * Handle a wiring failure on the given entry. * * The map should be locked. */ static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && entry->wired_count == 1, ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); KASSERT(failed_addr < entry->end, ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); /* * If any pages at the start of this entry were successfully wired, * then unwire them. */ if (failed_addr > entry->start) { pmap_unwire(map->pmap, entry->start, failed_addr); vm_object_unwire(entry->object.vm_object, entry->offset, failed_addr - entry->start, PQ_ACTIVE); } /* * Assign an out-of-range value to represent the failure to wire this * entry. */ entry->wired_count = -1; } int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { int rv; vm_map_lock(map); rv = vm_map_wire_locked(map, start, end, flags); vm_map_unlock(map); return (rv); } /* * vm_map_wire_locked: * * Implements both kernel and user wiring. Returns with the map locked, * the map lock may be dropped. */ int vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, next_entry, prev_entry; vm_offset_t faddr, saved_end, saved_start; u_long incr, npages; u_int bidx, last_timestamp; int rv; bool holes_ok, need_wakeup, user_wire; vm_prot_t prot; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); prot = 0; if (flags & VM_MAP_WIRE_WRITE) prot |= VM_PROT_WRITE; holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0; user_wire = (flags & VM_MAP_WIRE_USER) != 0; VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (holes_ok) first_entry = vm_map_entry_succ(first_entry); else return (KERN_INVALID_ADDRESS); } for (entry = first_entry; entry->start < end; entry = next_entry) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ next_entry = vm_map_entry_in_transition(map, start, &end, holes_ok, entry); if (next_entry == NULL) { if (entry == first_entry) return (KERN_INVALID_ADDRESS); rv = KERN_INVALID_ADDRESS; goto done; } first_entry = (entry == first_entry) ? next_entry : NULL; continue; } rv = vm_map_clip_start(map, entry, start); if (rv != KERN_SUCCESS) goto done; rv = vm_map_clip_end(map, entry, end); if (rv != KERN_SUCCESS) goto done; /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || (entry->protection & prot) != prot) { entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; if (!holes_ok) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } } else if (entry->wired_count == 0) { entry->wired_count++; npages = atop(entry->end - entry->start); if (user_wire && !vm_map_wire_user_count_add(npages)) { vm_map_wire_entry_failure(map, entry, entry->start); end = entry->end; rv = KERN_RESOURCE_SHORTAGE; goto done; } /* * Release the map lock, relying on the in-transition * mark. Mark the map busy for fork. */ saved_start = entry->start; saved_end = entry->end; last_timestamp = map->timestamp; bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; incr = pagesizes[bidx]; vm_map_busy(map); vm_map_unlock(map); for (faddr = saved_start; faddr < saved_end; faddr += incr) { /* * Simulate a fault to get the page and enter * it into the physical map. */ rv = vm_fault(map, faddr, VM_PROT_NONE, VM_FAULT_WIRE, NULL); if (rv != KERN_SUCCESS) break; } vm_map_lock(map); vm_map_unbusy(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ if (!vm_map_lookup_entry(map, saved_start, &next_entry)) KASSERT(false, ("vm_map_wire: lookup failed")); first_entry = (entry == first_entry) ? next_entry : NULL; for (entry = next_entry; entry->end < saved_end; entry = vm_map_entry_succ(entry)) { /* * In case of failure, handle entries * that were not fully wired here; * fully wired entries are handled * later. */ if (rv != KERN_SUCCESS && faddr < entry->end) vm_map_wire_entry_failure(map, entry, faddr); } } if (rv != KERN_SUCCESS) { vm_map_wire_entry_failure(map, entry, faddr); if (user_wire) vm_map_wire_user_count_sub(npages); end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If holes_ok was specified, skip this check. */ next_entry = vm_map_entry_succ(entry); if (!holes_ok && entry->end < end && next_entry->start > entry->end) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } } rv = KERN_SUCCESS; done: need_wakeup = false; if (first_entry == NULL && !vm_map_lookup_entry(map, start, &first_entry)) { KASSERT(holes_ok, ("vm_map_wire: lookup failed")); prev_entry = first_entry; entry = vm_map_entry_succ(first_entry); } else { prev_entry = vm_map_entry_pred(first_entry); entry = first_entry; } for (; entry->start < end; prev_entry = entry, entry = vm_map_entry_succ(entry)) { /* * If holes_ok was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for faulting in the * pages or draining MAP_ENTRY_IN_TRANSITION. * Moreover, another thread could be simultaneously * wiring this new mapping entry. Detect these cases * and skip any entries marked as in transition not by us. * * Another way to get an entry not marked with * MAP_ENTRY_IN_TRANSITION is after failed clipping, * which set rv to KERN_INVALID_ARGUMENT. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT, ("vm_map_wire: !HOLESOK and new/changed entry")); continue; } if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) { /* do nothing */ } else if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { /* * Undo the wiring. Wiring succeeded on this entry * but failed on a later entry. */ if (entry->wired_count == 1) { vm_map_entry_unwire(map, entry); if (user_wire) vm_map_wire_user_count_sub( atop(entry->end - entry->start)); } else entry->wired_count--; } KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_wire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_wire: alien wire %p", entry)); entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WIRE_SKIPPED); entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = true; } vm_map_try_merge_entries(map, prev_entry, entry); } vm_map_try_merge_entries(map, prev_entry, entry); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t entry, first_entry, next_entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; unsigned int last_timestamp; int bdry_idx; boolean_t failed; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = first_entry->start; end = first_entry->end; } /* * Make a first pass to check for user-wired memory, holes, * and partial invalidation of largepage mappings. */ for (entry = first_entry; entry->start < end; entry = next_entry) { if (invalidate) { if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; if (bdry_idx != 0 && ((start & (pagesizes[bdry_idx] - 1)) != 0 || (end & (pagesizes[bdry_idx] - 1)) != 0)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } } next_entry = vm_map_entry_succ(entry); if (end > entry->end && entry->end != next_entry->start) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(map->pmap, start, end); failed = FALSE; /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (entry = first_entry; entry->start < end;) { offset = entry->offset + (start - entry->start); size = (end <= entry->end ? end : entry->end) - start; if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = entry->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = entry->object.vm_object; } vm_object_reference(object); last_timestamp = map->timestamp; vm_map_unlock_read(map); if (!vm_object_sync(object, offset, size, syncio, invalidate)) failed = TRUE; start += size; vm_object_deallocate(object); vm_map_lock_read(map); if (last_timestamp == map->timestamp || !vm_map_lookup_entry(map, start, &entry)) entry = vm_map_entry_succ(entry); } vm_map_unlock_read(map); return (failed ? KERN_FAILURE : KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { vm_size_t size; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->wired_count > 0, ("vm_map_entry_unwire: entry %p isn't wired", entry)); size = entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) vm_map_wire_user_count_sub(atop(size)); pmap_unwire(map->pmap, entry->start, entry->end); vm_object_unwire(entry->object.vm_object, entry->offset, size, PQ_ACTIVE); entry->wired_count = 0; } static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) vm_object_deallocate(entry->object.vm_object); uma_zfree(system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, size1; vm_size_t size; vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE); object = entry->object.vm_object; if ((entry->eflags & MAP_ENTRY_GUARD) != 0) { MPASS(entry->cred == NULL); MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0); MPASS(object == NULL); vm_map_entry_deallocate(entry, map->system_map); return; } size = entry->end - entry->start; map->size -= size; if (entry->cred != NULL) { swap_release_by_cred(size, entry->cred); crfree(entry->cred); } if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) { entry->object.vm_object = NULL; } else if ((object->flags & OBJ_ANON) != 0 || object == kernel_object) { KASSERT(entry->cred == NULL || object->cred == NULL || (entry->eflags & MAP_ENTRY_NEEDS_COPY), ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + atop(size); VM_OBJECT_WLOCK(object); if (object->ref_count != 1 && ((object->flags & OBJ_ONEMAPPING) != 0 || object == kernel_object)) { vm_object_collapse(object); /* * The option OBJPR_NOTMAPPED can be passed here * because vm_map_delete() already performed * pmap_remove() on the only mapping to this range * of pages. */ vm_object_page_remove(object, offidxstart, offidxend, OBJPR_NOTMAPPED); if (offidxend >= object->size && offidxstart < object->size) { size1 = object->size; object->size = offidxstart; if (object->cred != NULL) { size1 -= object->size; KASSERT(object->charge >= ptoa(size1), ("object %p charge < 0", object)); swap_release_by_cred(ptoa(size1), object->cred); object->charge -= ptoa(size1); } } } VM_OBJECT_WUNLOCK(object); } if (map->system_map) vm_map_entry_deallocate(entry, TRUE); else { entry->defer_next = curthread->td_map_def_user; curthread->td_map_def_user = entry; } } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. */ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry, next_entry, scratch_entry; int rv; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); /* * Find the start of the region, and clip it. * Step through all entries in this region. */ rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry); if (rv != KERN_SUCCESS) return (rv); for (; entry->start < end; entry = next_entry) { /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on * user maps. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && vm_map_entry_system_wired_count(entry) != 0)) { unsigned int last_timestamp; vm_offset_t saved_start; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, 0); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ rv = vm_map_lookup_clip_start(map, saved_start, &next_entry, &scratch_entry); if (rv != KERN_SUCCESS) break; } else next_entry = entry; continue; } /* XXXKIB or delete to the upper superpage boundary ? */ rv = vm_map_clip_end(map, entry, end); if (rv != KERN_SUCCESS) break; next_entry = vm_map_entry_succ(entry); /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); /* * Remove mappings for the pages, but only if the * mappings could exist. For instance, it does not * make sense to call pmap_remove() for guard entries. */ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || entry->object.vm_object != NULL) pmap_remove(map->pmap, entry->start, entry->end); if (entry->end == map->anon_loc) map->anon_loc = entry->start; /* * Delete the entry only after removing all pmap * entries pointing to its pages. (Otherwise, its * page frames may be reallocated, and any modify bits * will be set in the wrong object!) */ vm_map_entry_delete(map, entry); } return (rv); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = vm_map_entry_succ(entry); } return (TRUE); } /* * * vm_map_copy_swap_object: * * Copies a swap-backed object from an existing map entry to a * new one. Carries forward the swap charge. May change the * src object on return. */ static void vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_offset_t size, vm_ooffset_t *fork_charge) { vm_object_t src_object; struct ucred *cred; int charged; src_object = src_entry->object.vm_object; charged = ENTRY_CHARGED(src_entry); if ((src_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(src_object); vm_object_collapse(src_object); if ((src_object->flags & OBJ_ONEMAPPING) != 0) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(src_object); } else vm_object_reference(src_object); if (src_entry->cred != NULL && !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { KASSERT(src_object->cred == NULL, ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p", src_object)); src_object->cred = src_entry->cred; src_object->charge = size; } dst_entry->object.vm_object = src_object; if (charged) { cred = curthread->td_ucred; crhold(cred); dst_entry->cred = cred; *fork_charge += size; if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { crhold(cred); src_entry->cred = cred; *fork_charge += size; } } } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_ooffset_t *fork_charge) { vm_object_t src_object; vm_map_entry_t fake_entry; vm_offset_t size; VM_MAP_ASSERT_LOCKED(dst_map); if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0 || (src_entry->protection & VM_PROT_WRITE) == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && (src_entry->protection & VM_PROT_WRITE) != 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ size = src_entry->end - src_entry->start; if ((src_object = src_entry->object.vm_object) != NULL) { if (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP) { vm_map_copy_swap_object(src_entry, dst_entry, size, fork_charge); /* May have split/collapsed, reload obj. */ src_object = src_entry->object.vm_object; } else { vm_object_reference(src_object); dst_entry->object.vm_object = src_object; } src_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->offset = src_entry->offset; if (src_entry->eflags & MAP_ENTRY_WRITECNT) { /* * MAP_ENTRY_WRITECNT cannot * indicate write reference from * src_entry, since the entry is * marked as needs copy. Allocate a * fake entry that is used to * decrement object->un_pager writecount * at the appropriate time. Attach * fake_entry to the deferred list. */ fake_entry = vm_map_entry_create(dst_map); fake_entry->eflags = MAP_ENTRY_WRITECNT; src_entry->eflags &= ~MAP_ENTRY_WRITECNT; vm_object_reference(src_object); fake_entry->object.vm_object = src_object; fake_entry->start = src_entry->start; fake_entry->end = src_entry->end; fake_entry->defer_next = curthread->td_map_def_user; curthread->td_map_def_user = fake_entry; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; if (src_entry->cred != NULL) { dst_entry->cred = curthread->td_ucred; crhold(dst_entry->cred); *fork_charge += size; } } } else { /* * We don't want to make writeable wired pages copy-on-write. * Immediately copy these pages into the new map by simulating * page faults. The new pages are pageable. */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, fork_charge); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) */ static void vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_map_entry_t entry) { vm_size_t entrysize; vm_offset_t newend; if ((entry->eflags & MAP_ENTRY_GUARD) != 0) return; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { vm2->vm_ssize += btoc(entrysize); } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); vm2->vm_dsize += btoc(newend - entry->start); } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); vm2->vm_tsize += btoc(newend - entry->start); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * XXX It might be worth coalescing the entries added to the new vmspace. * * The source map must not be locked. */ struct vmspace * vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) { struct vmspace *vm2; vm_map_t new_map, old_map; vm_map_entry_t new_entry, old_entry; vm_object_t object; int error, locked; vm_inherit_t inh; old_map = &vm1->vm_map; /* Copy immutable fields of vm1 to vm2. */ vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map), pmap_pinit); if (vm2 == NULL) return (NULL); vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; vm_map_lock(old_map); if (old_map->busy) vm_map_wait_busy(old_map); new_map = &vm2->vm_map; locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ KASSERT(locked, ("vmspace_fork: lock failed")); error = pmap_vmspace_copy(new_map->pmap, old_map->pmap); if (error != 0) { sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); vmspace_free(vm2); return (NULL); } new_map->anon_loc = old_map->anon_loc; new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART); VM_MAP_ENTRY_FOREACH(old_entry, old_map) { if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) panic("vm_map_fork: encountered a submap"); inh = old_entry->inheritance; if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 && inh != VM_INHERIT_NONE) inh = VM_INHERIT_COPY; switch (inh) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if * necessary. */ object = old_entry->object.vm_object; if (object == NULL) { vm_map_entry_back(old_entry); object = old_entry->object.vm_object; } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, old_entry->end - old_entry->start, old_entry->cred, /* Transfer the second reference too. */ true); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; old_entry->cred = NULL; /* * As in vm_map_merged_neighbor_dispose(), * the vnode lock will not be acquired in * this call to vm_object_deallocate(). */ vm_object_deallocate(object); object = old_entry->object.vm_object; } else { VM_OBJECT_WLOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); if (old_entry->cred != NULL) { KASSERT(object->cred == NULL, ("vmspace_fork both cred")); object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } /* * Assert the correct state of the vnode * v_writecount while the object is locked, to * not relock it later for the assertion * correctness. */ if (old_entry->eflags & MAP_ENTRY_WRITECNT && object->type == OBJT_VNODE) { KASSERT(((struct vnode *)object-> handle)->v_writecount > 0, ("vmspace_fork: v_writecount %p", object)); KASSERT(object->un_pager.vnp. writemappings > 0, ("vmspace_fork: vnp.writecount %p", object)); } VM_OBJECT_WUNLOCK(object); } /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; if (new_entry->eflags & MAP_ENTRY_WRITECNT) { vm_pager_update_writecount(object, new_entry->start, new_entry->end); } vm_map_entry_set_vnode_text(new_entry, true); /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; /* * Copied entry is COW over the old object. */ new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->cred = NULL; vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry, fork_charge); vm_map_entry_set_vnode_text(new_entry, true); break; case VM_INHERIT_ZERO: /* * Create a new anonymous mapping entry modelled from * the old one. */ new_entry = vm_map_entry_create(new_map); memset(new_entry, 0, sizeof(*new_entry)); new_entry->start = old_entry->start; new_entry->end = old_entry->end; new_entry->eflags = old_entry->eflags & ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC | MAP_ENTRY_SPLIT_BOUNDARY_MASK); new_entry->protection = old_entry->protection; new_entry->max_protection = old_entry->max_protection; new_entry->inheritance = VM_INHERIT_ZERO; vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); new_entry->cred = curthread->td_ucred; crhold(new_entry->cred); *fork_charge += (new_entry->end - new_entry->start); break; } } /* * Use inlined vm_map_unlock() to postpone handling the deferred * map entries, which cannot be done until both old_map and * new_map locks are released. */ sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); return (vm2); } /* * Create a process's stack for exec_new_vmspace(). This function is never * asked to wire the newly created stack. */ int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_size_t growsize, init_ssize; rlim_t vmemlim; int rv; MPASS((map->flags & MAP_WIREFUTURE) == 0); growsize = sgrowsiz; init_ssize = (max_ssize < growsize) ? max_ssize : growsize; vm_map_lock(map); vmemlim = lim_cur(curthread, RLIMIT_VMEM); /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { rv = KERN_NO_SPACE; goto out; } rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, max, cow); out: vm_map_unlock(map); return (rv); } static int stack_guard_page = 1; SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, &stack_guard_page, 0, "Specifies the number of guard pages for a stack that grows"); static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, gap_bot, gap_top, top; vm_size_t init_ssize, sgp; int orient, rv; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. */ orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP); KASSERT(orient != 0, ("No stack grow direction")); KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP), ("bi-dir stack")); if (max_ssize == 0 || !vm_map_range_valid(map, addrbos, addrbos + max_ssize)) return (KERN_INVALID_ADDRESS); sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : (vm_size_t)stack_guard_page * PAGE_SIZE; if (sgp >= max_ssize) return (KERN_INVALID_ARGUMENT); init_ssize = growsize; if (max_ssize < init_ssize + sgp) init_ssize = max_ssize - sgp; /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) return (KERN_NO_SPACE); /* * If we can't accommodate max_ssize in the current mapping, no go. */ if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize) return (KERN_NO_SPACE); /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) { bot = addrbos + max_ssize - init_ssize; top = bot + init_ssize; gap_bot = addrbos; gap_top = bot; } else /* if (orient == MAP_STACK_GROWS_UP) */ { bot = addrbos; top = bot + init_ssize; gap_bot = top; gap_top = addrbos + max_ssize; } rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); if (rv != KERN_SUCCESS) return (rv); new_entry = vm_map_entry_succ(prev_entry); KASSERT(new_entry->end == top || new_entry->start == bot, ("Bad entry start/end for new stack entry")); KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, ("new entry lacks MAP_ENTRY_GROWS_DOWN")); KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, ("new entry lacks MAP_ENTRY_GROWS_UP")); if (gap_bot == gap_top) return (KERN_SUCCESS); rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); if (rv == KERN_SUCCESS) { /* * Gap can never successfully handle a fault, so * read-ahead logic is never used for it. Re-use * next_read of the gap entry to store * stack_guard_page for vm_map_growstack(). */ if (orient == MAP_STACK_GROWS_DOWN) vm_map_entry_pred(new_entry)->next_read = sgp; else vm_map_entry_succ(new_entry)->next_read = sgp; } else { (void)vm_map_delete(map, bot, top); } return (rv); } /* * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we * successfully grow the stack. */ static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry) { vm_map_entry_t stack_entry; struct proc *p; struct vmspace *vm; struct ucred *cred; vm_offset_t gap_end, gap_start, grow_start; vm_size_t grow_amount, guard, max_grow; rlim_t lmemlim, stacklim, vmemlim; int rv, rv1; bool gap_deleted, grow_down, is_procstack; #ifdef notyet uint64_t limit; #endif #ifdef RACCT int error; #endif p = curproc; vm = p->p_vmspace; /* * Disallow stack growth when the access is performed by a * debugger or AIO daemon. The reason is that the wrong * resource limits are applied. */ if (p != initproc && (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)) return (KERN_FAILURE); MPASS(!map->system_map); lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); stacklim = lim_cur(curthread, RLIMIT_STACK); vmemlim = lim_cur(curthread, RLIMIT_VMEM); retry: /* If addr is not in a hole for a stack grow area, no need to grow. */ if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) return (KERN_FAILURE); if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0) return (KERN_SUCCESS); if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) { stack_entry = vm_map_entry_succ(gap_entry); if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 || stack_entry->start != gap_entry->end) return (KERN_FAILURE); grow_amount = round_page(stack_entry->start - addr); grow_down = true; } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) { stack_entry = vm_map_entry_pred(gap_entry); if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 || stack_entry->end != gap_entry->start) return (KERN_FAILURE); grow_amount = round_page(addr + 1 - stack_entry->end); grow_down = false; } else { return (KERN_FAILURE); } guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : gap_entry->next_read; max_grow = gap_entry->end - gap_entry->start; if (guard > max_grow) return (KERN_NO_SPACE); max_grow -= guard; if (grow_amount > max_grow) return (KERN_NO_SPACE); /* * If this is the main process stack, see if we're over the stack * limit. */ is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr && addr < (vm_offset_t)p->p_sysent->sv_usrstack; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) return (KERN_NO_SPACE); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (is_procstack && racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) { PROC_UNLOCK(p); return (KERN_NO_SPACE); } PROC_UNLOCK(p); } #endif grow_amount = roundup(grow_amount, sgrowsiz); if (grow_amount > max_grow) grow_amount = max_grow; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = trunc_page((vm_size_t)stacklim) - ctob(vm->vm_ssize); } #ifdef notyet PROC_LOCK(p); limit = racct_get_available(p, RACCT_STACK); PROC_UNLOCK(p); if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) grow_amount = limit - ctob(vm->vm_ssize); #endif if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) { if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { PROC_UNLOCK(p); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { PROC_UNLOCK(p); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif if (vm_map_lock_upgrade(map)) { gap_entry = NULL; vm_map_lock_read(map); goto retry; } if (grow_down) { grow_start = gap_entry->end - grow_amount; if (gap_entry->start + grow_amount == gap_entry->end) { gap_start = gap_entry->start; gap_end = gap_entry->end; vm_map_entry_delete(map, gap_entry); gap_deleted = true; } else { MPASS(gap_entry->start < gap_entry->end - grow_amount); vm_map_entry_resize(map, gap_entry, -grow_amount); gap_deleted = false; } rv = vm_map_insert(map, NULL, 0, grow_start, grow_start + grow_amount, stack_entry->protection, stack_entry->max_protection, MAP_STACK_GROWS_DOWN); if (rv != KERN_SUCCESS) { if (gap_deleted) { rv1 = vm_map_insert(map, NULL, 0, gap_start, gap_end, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN); MPASS(rv1 == KERN_SUCCESS); } else vm_map_entry_resize(map, gap_entry, grow_amount); } } else { grow_start = stack_entry->end; cred = stack_entry->cred; if (cred == NULL && stack_entry->object.vm_object != NULL) cred = stack_entry->object.vm_object->cred; if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred)) rv = KERN_NO_SPACE; /* Grow the underlying object if applicable. */ else if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), grow_amount, cred != NULL)) { if (gap_entry->start + grow_amount == gap_entry->end) { vm_map_entry_delete(map, gap_entry); vm_map_entry_resize(map, stack_entry, grow_amount); } else { gap_entry->start += grow_amount; stack_entry->end += grow_amount; } map->size += grow_amount; rv = KERN_SUCCESS; } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) { rv = vm_map_wire_locked(map, grow_start, grow_start + grow_amount, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); } vm_map_lock_downgrade(map); out: #ifdef RACCT if (racct_enable && rv != KERN_SUCCESS) { PROC_LOCK(p); error = racct_set(p, RACCT_VMEM, map->size); KASSERT(error == 0, ("decreasing RACCT_VMEM failed")); if (!old_mlock) { error = racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed")); } error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize)); KASSERT(error == 0, ("decreasing RACCT_STACK failed")); PROC_UNLOCK(p); } #endif return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ int vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0, ("vmspace_exec recursed")); newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit); if (newvmspace == NULL) return (ENOMEM); newvmspace->vm_swrss = oldvmspace->vm_swrss; /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); curthread->td_pflags |= TDP_EXECVMSPC; return (0); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ int vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; vm_ooffset_t fork_charge; - if (oldvmspace->vm_refcnt == 1) + if (refcount_load(&oldvmspace->vm_refcnt) == 1) return (0); fork_charge = 0; newvmspace = vmspace_fork(oldvmspace, &fork_charge); if (newvmspace == NULL) return (ENOMEM); if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) { vmspace_free(newvmspace); return (ENOMEM); } PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); vmspace_free(oldvmspace); return (0); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type; vm_object_t eobject; vm_size_t size; struct ucred *cred; RetryLookup: vm_map_lock_read(map); RetryLookupLocked: /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } entry = *out_entry; /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. */ prot = entry->protection; if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) { fault_typea &= ~VM_PROT_FAULT_LOOKUP; if (prot == VM_PROT_NONE && map != kernel_map && (entry->eflags & MAP_ENTRY_GUARD) != 0 && (entry->eflags & (MAP_ENTRY_STACK_GAP_DN | MAP_ENTRY_STACK_GAP_UP)) != 0 && vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS) goto RetryLookupLocked; } fault_type = fault_typea & VM_PROT_ALL; if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags & (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) != (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY), ("entry %p flags %x", entry, entry->eflags)); if ((fault_typea & VM_PROT_COPY) != 0 && (entry->max_protection & VM_PROT_WRITE) == 0 && (entry->eflags & MAP_ENTRY_COW) == 0) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; size = entry->end - entry->start; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if ((fault_type & VM_PROT_WRITE) != 0 || (fault_typea & VM_PROT_COPY) != 0) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; if (entry->cred == NULL) { /* * The debugger owner is charged for * the memory. */ cred = curthread->td_ucred; crhold(cred); if (!swap_reserve_by_cred(size, cred)) { crfree(cred); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } entry->cred = cred; } eobject = entry->object.vm_object; vm_object_shadow(&entry->object.vm_object, &entry->offset, size, entry->cred, false); if (eobject == entry->object.vm_object) { /* * The object was not shadowed. */ swap_release_by_cred(size, entry->cred); crfree(entry->cred); } entry->cred = NULL; entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate_anon(atop(size), NULL, entry->cred, entry->cred != NULL ? size : 0); entry->offset = 0; entry->cred = NULL; vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_locked: * * Lookup the faulting address. A version of vm_map_lookup that returns * KERN_FAILURE instead of blocking on map lock or memory allocation. */ int vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) return (KERN_INVALID_ADDRESS); entry = *out_entry; /* * Fail if the entry refers to a submap. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return (KERN_FAILURE); /* * Check whether this task is allowed to have this page. */ prot = entry->protection; fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * Fail if the entry was copy-on-write for a write fault. */ if (fault_type & VM_PROT_WRITE) return (KERN_FAILURE); /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } /* * Fail if an object should be created. */ if (entry->object.vm_object == NULL && !map->system_map) return (KERN_FAILURE); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } vm_offset_t vm_map_max_KBI(const struct vm_map *map) { return (vm_map_max(map)); } vm_offset_t vm_map_min_KBI(const struct vm_map *map) { return (vm_map_min(map)); } pmap_t vm_map_pmap_KBI(vm_map_t map) { return (map->pmap); } bool vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end) { return (vm_map_range_valid(map, start, end)); } #ifdef INVARIANTS static void _vm_map_assert_consistent(vm_map_t map, int check) { vm_map_entry_t entry, prev; vm_map_entry_t cur, header, lbound, ubound; vm_size_t max_left, max_right; #ifdef DIAGNOSTIC ++map->nupdates; #endif if (enable_vmmap_check != check) return; header = prev = &map->header; VM_MAP_ENTRY_FOREACH(entry, map) { KASSERT(prev->end <= entry->start, ("map %p prev->end = %jx, start = %jx", map, (uintmax_t)prev->end, (uintmax_t)entry->start)); KASSERT(entry->start < entry->end, ("map %p start = %jx, end = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->end)); KASSERT(entry->left == header || entry->left->start < entry->start, ("map %p left->start = %jx, start = %jx", map, (uintmax_t)entry->left->start, (uintmax_t)entry->start)); KASSERT(entry->right == header || entry->start < entry->right->start, ("map %p start = %jx, right->start = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->right->start)); cur = map->root; lbound = ubound = header; for (;;) { if (entry->start < cur->start) { ubound = cur; cur = cur->left; KASSERT(cur != lbound, ("map %p cannot find %jx", map, (uintmax_t)entry->start)); } else if (cur->end <= entry->start) { lbound = cur; cur = cur->right; KASSERT(cur != ubound, ("map %p cannot find %jx", map, (uintmax_t)entry->start)); } else { KASSERT(cur == entry, ("map %p cannot find %jx", map, (uintmax_t)entry->start)); break; } } max_left = vm_map_entry_max_free_left(entry, lbound); max_right = vm_map_entry_max_free_right(entry, ubound); KASSERT(entry->max_free == vm_size_max(max_left, max_right), ("map %p max = %jx, max_left = %jx, max_right = %jx", map, (uintmax_t)entry->max_free, (uintmax_t)max_left, (uintmax_t)max_right)); prev = entry; } KASSERT(prev->end <= entry->start, ("map %p prev->end = %jx, start = %jx", map, (uintmax_t)prev->end, (uintmax_t)entry->start)); } #endif #include "opt_ddb.h" #ifdef DDB #include #include static void vm_map_print(vm_map_t map) { vm_map_entry_t entry, prev; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); db_indent += 2; prev = &map->header; VM_MAP_ENTRY_FOREACH(entry, map) { db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n", (void *)entry, (void *)entry->start, (void *)entry->end, entry->eflags); { static const char * const inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char) entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); if (prev == &map->header || prev->object.sub_map != entry->object.sub_map) { db_indent += 2; vm_map_print((vm_map_t)entry->object.sub_map); db_indent -= 2; } } else { if (entry->cred != NULL) db_printf(", ruid %d", entry->cred->cr_ruid); db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->object.vm_object && entry->object.vm_object->cred) db_printf(", obj ruid %d charge %jx", entry->object.vm_object->cred->cr_ruid, (uintmax_t)entry->object.vm_object->charge); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); if (prev == &map->header || prev->object.vm_object != entry->object.vm_object) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, 0, 0, (char *)0); db_indent -= 2; } } prev = entry; } db_indent -= 2; } DB_SHOW_COMMAND(map, map) { if (!have_addr) { db_printf("usage: show map \n"); return; } vm_map_print((vm_map_t)addr); } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = db_lookup_proc(addr); } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((vm_map_t)&p->p_vmspace->vm_map); } #endif /* DDB */ Index: head/sys/vm/vm_map.h =================================================================== --- head/sys/vm/vm_map.h (revision 367333) +++ head/sys/vm/vm_map.h (revision 367334) @@ -1,528 +1,528 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_map.h 8.9 (Berkeley) 5/17/95 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory map module definitions. */ #ifndef _VM_MAP_ #define _VM_MAP_ #include #include #include /* * Types defined: * * vm_map_t the high-level address map data structure. * vm_map_entry_t an entry in an address map. */ typedef u_char vm_flags_t; typedef u_int vm_eflags_t; /* * Objects which live in maps may be either VM objects, or * another map (called a "sharing map") which denotes read-write * sharing with other maps. */ union vm_map_object { struct vm_object *vm_object; /* object object */ struct vm_map *sub_map; /* belongs to another map */ }; /* * Address map entries consist of start and end addresses, * a VM object (or sharing map) and offset into that object, * and user-exported inheritance and protection information. * Also included is control information for virtual copy operations. */ struct vm_map_entry { struct vm_map_entry *left; /* left child or previous entry */ struct vm_map_entry *right; /* right child or next entry */ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ vm_offset_t next_read; /* vaddr of the next sequential read */ vm_size_t max_free; /* max free space in subtree */ union vm_map_object object; /* object I point to */ vm_ooffset_t offset; /* offset into object */ vm_eflags_t eflags; /* map entry flags */ vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ vm_inherit_t inheritance; /* inheritance */ uint8_t read_ahead; /* pages in the read-ahead window */ int wired_count; /* can be paged if = 0 */ struct ucred *cred; /* tmp storage for creator ref */ struct thread *wiring_thread; }; #define MAP_ENTRY_NOSYNC 0x00000001 #define MAP_ENTRY_IS_SUB_MAP 0x00000002 #define MAP_ENTRY_COW 0x00000004 #define MAP_ENTRY_NEEDS_COPY 0x00000008 #define MAP_ENTRY_NOFAULT 0x00000010 #define MAP_ENTRY_USER_WIRED 0x00000020 #define MAP_ENTRY_BEHAV_NORMAL 0x00000000 /* default behavior */ #define MAP_ENTRY_BEHAV_SEQUENTIAL 0x00000040 /* expect sequential access */ #define MAP_ENTRY_BEHAV_RANDOM 0x00000080 /* expect random access */ #define MAP_ENTRY_BEHAV_RESERVED 0x000000c0 /* future use */ #define MAP_ENTRY_BEHAV_MASK 0x000000c0 #define MAP_ENTRY_IN_TRANSITION 0x00000100 /* entry being changed */ #define MAP_ENTRY_NEEDS_WAKEUP 0x00000200 /* waiters in transition */ #define MAP_ENTRY_NOCOREDUMP 0x00000400 /* don't include in a core */ #define MAP_ENTRY_VN_EXEC 0x00000800 /* text vnode mapping */ #define MAP_ENTRY_GROWS_DOWN 0x00001000 /* top-down stacks */ #define MAP_ENTRY_GROWS_UP 0x00002000 /* bottom-up stacks */ #define MAP_ENTRY_WIRE_SKIPPED 0x00004000 #define MAP_ENTRY_WRITECNT 0x00008000 /* tracked writeable mapping */ #define MAP_ENTRY_GUARD 0x00010000 #define MAP_ENTRY_STACK_GAP_DN 0x00020000 #define MAP_ENTRY_STACK_GAP_UP 0x00040000 #define MAP_ENTRY_HEADER 0x00080000 #define MAP_ENTRY_SPLIT_BOUNDARY_MASK 0x00300000 #define MAP_ENTRY_SPLIT_BOUNDARY_SHIFT 20 #ifdef _KERNEL static __inline u_char vm_map_entry_behavior(vm_map_entry_t entry) { return (entry->eflags & MAP_ENTRY_BEHAV_MASK); } static __inline int vm_map_entry_user_wired_count(vm_map_entry_t entry) { if (entry->eflags & MAP_ENTRY_USER_WIRED) return (1); return (0); } static __inline int vm_map_entry_system_wired_count(vm_map_entry_t entry) { return (entry->wired_count - vm_map_entry_user_wired_count(entry)); } #endif /* _KERNEL */ /* * A map is a set of map entries. These map entries are * organized as a threaded binary search tree. Both structures * are ordered based upon the start and end addresses contained * within each map entry. The largest gap between an entry in a * subtree and one of its neighbors is saved in the max_free * field, and that field is updated when the tree is * restructured. * * Sleator and Tarjan's top-down splay algorithm is employed to * control height imbalance in the binary search tree. * * The map's min offset value is stored in map->header.end, and * its max offset value is stored in map->header.start. These * values act as sentinels for any forward or backward address * scan of the list. The right and left fields of the map * header point to the first and list map entries. The map * header has a special value for the eflags field, * MAP_ENTRY_HEADER, that is set initially, is never changed, * and prevents an eflags match of the header with any other map * entry. * * List of locks * (c) const until freed */ struct vm_map { struct vm_map_entry header; /* List of entries */ struct sx lock; /* Lock for map data */ struct mtx system_mtx; int nentries; /* Number of entries */ vm_size_t size; /* virtual size */ u_int timestamp; /* Version number */ u_char needs_wakeup; u_char system_map; /* (c) Am I a system map? */ vm_flags_t flags; /* flags for this vm_map */ vm_map_entry_t root; /* Root of a binary search tree */ pmap_t pmap; /* (c) Physical map */ vm_offset_t anon_loc; int busy; #ifdef DIAGNOSTIC int nupdates; #endif }; /* * vm_flags_t values */ #define MAP_WIREFUTURE 0x01 /* wire all future pages */ #define MAP_BUSY_WAKEUP 0x02 #define MAP_IS_SUB_MAP 0x04 /* has parent */ #define MAP_ASLR 0x08 /* enabled ASLR */ #define MAP_ASLR_IGNSTART 0x10 #ifdef _KERNEL #if defined(KLD_MODULE) && !defined(KLD_TIED) #define vm_map_max(map) vm_map_max_KBI((map)) #define vm_map_min(map) vm_map_min_KBI((map)) #define vm_map_pmap(map) vm_map_pmap_KBI((map)) #define vm_map_range_valid(map, start, end) \ vm_map_range_valid_KBI((map), (start), (end)) #else static __inline vm_offset_t vm_map_max(const struct vm_map *map) { return (map->header.start); } static __inline vm_offset_t vm_map_min(const struct vm_map *map) { return (map->header.end); } static __inline pmap_t vm_map_pmap(vm_map_t map) { return (map->pmap); } static __inline void vm_map_modflags(vm_map_t map, vm_flags_t set, vm_flags_t clear) { map->flags = (map->flags | set) & ~clear; } static inline bool vm_map_range_valid(vm_map_t map, vm_offset_t start, vm_offset_t end) { if (end < start) return (false); if (start < vm_map_min(map) || end > vm_map_max(map)) return (false); return (true); } #endif /* KLD_MODULE */ #endif /* _KERNEL */ /* * Shareable process virtual address space. * * List of locks * (c) const until freed */ struct vmspace { struct vm_map vm_map; /* VM address map */ struct shmmap_state *vm_shm; /* SYS5 shared memory private data XXX */ segsz_t vm_swrss; /* resident set size before last swap */ segsz_t vm_tsize; /* text size (pages) XXX */ segsz_t vm_dsize; /* data size (pages) XXX */ segsz_t vm_ssize; /* stack size (pages) */ caddr_t vm_taddr; /* (c) user virtual address of text */ caddr_t vm_daddr; /* (c) user virtual address of data */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ - volatile int vm_refcnt; /* number of references */ + u_int vm_refcnt; /* number of references */ /* * Keep the PMAP last, so that CPU-specific variations of that * structure on a single architecture don't result in offset * variations of the machine-independent fields in the vmspace. */ struct pmap vm_pmap; /* private physical map */ }; #ifdef _KERNEL static __inline pmap_t vmspace_pmap(struct vmspace *vmspace) { return &vmspace->vm_pmap; } #endif /* _KERNEL */ #ifdef _KERNEL /* * Macros: vm_map_lock, etc. * Function: * Perform locking on the data portion of a map. Note that * these macros mimic procedure calls returning void. The * semicolon is supplied by the user of these macros, not * by the macros themselves. The macros can safely be used * as unbraced elements in a higher level statement. */ void _vm_map_lock(vm_map_t map, const char *file, int line); void _vm_map_unlock(vm_map_t map, const char *file, int line); int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line); void _vm_map_lock_read(vm_map_t map, const char *file, int line); void _vm_map_unlock_read(vm_map_t map, const char *file, int line); int _vm_map_trylock(vm_map_t map, const char *file, int line); int _vm_map_trylock_read(vm_map_t map, const char *file, int line); int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line); void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line); int vm_map_locked(vm_map_t map); void vm_map_wakeup(vm_map_t map); void vm_map_busy(vm_map_t map); void vm_map_unbusy(vm_map_t map); void vm_map_wait_busy(vm_map_t map); vm_offset_t vm_map_max_KBI(const struct vm_map *map); vm_offset_t vm_map_min_KBI(const struct vm_map *map); pmap_t vm_map_pmap_KBI(vm_map_t map); bool vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end); #define vm_map_lock(map) _vm_map_lock(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock(map) _vm_map_unlock(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock_and_wait(map, timo) \ _vm_map_unlock_and_wait(map, timo, LOCK_FILE, LOCK_LINE) #define vm_map_lock_read(map) _vm_map_lock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock_read(map) _vm_map_unlock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_trylock(map) _vm_map_trylock(map, LOCK_FILE, LOCK_LINE) #define vm_map_trylock_read(map) \ _vm_map_trylock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_lock_upgrade(map) \ _vm_map_lock_upgrade(map, LOCK_FILE, LOCK_LINE) #define vm_map_lock_downgrade(map) \ _vm_map_lock_downgrade(map, LOCK_FILE, LOCK_LINE) long vmspace_resident_count(struct vmspace *vmspace); #endif /* _KERNEL */ /* * Copy-on-write flags for vm_map operations */ #define MAP_INHERIT_SHARE 0x00000001 #define MAP_COPY_ON_WRITE 0x00000002 #define MAP_NOFAULT 0x00000004 #define MAP_PREFAULT 0x00000008 #define MAP_PREFAULT_PARTIAL 0x00000010 #define MAP_DISABLE_SYNCER 0x00000020 #define MAP_CHECK_EXCL 0x00000040 #define MAP_CREATE_GUARD 0x00000080 #define MAP_DISABLE_COREDUMP 0x00000100 #define MAP_PREFAULT_MADVISE 0x00000200 /* from (user) madvise request */ #define MAP_WRITECOUNT 0x00000400 #define MAP_REMAP 0x00000800 #define MAP_STACK_GROWS_DOWN 0x00001000 #define MAP_STACK_GROWS_UP 0x00002000 #define MAP_ACC_CHARGED 0x00004000 #define MAP_ACC_NO_CHARGE 0x00008000 #define MAP_CREATE_STACK_GAP_UP 0x00010000 #define MAP_CREATE_STACK_GAP_DN 0x00020000 #define MAP_VN_EXEC 0x00040000 #define MAP_SPLIT_BOUNDARY_MASK 0x00180000 #define MAP_SPLIT_BOUNDARY_SHIFT 19 /* * vm_fault option flags */ #define VM_FAULT_NORMAL 0x00 /* Nothing special */ #define VM_FAULT_WIRE 0x01 /* Wire the mapped page */ #define VM_FAULT_DIRTY 0x02 /* Dirty the page; use w/VM_PROT_COPY */ #define VM_FAULT_NOFILL 0x04 /* Fail if the pager doesn't have a copy */ /* * Initially, mappings are slightly sequential. The maximum window size must * account for the map entry's "read_ahead" field being defined as an uint8_t. */ #define VM_FAULT_READ_AHEAD_MIN 7 #define VM_FAULT_READ_AHEAD_INIT 15 #define VM_FAULT_READ_AHEAD_MAX min(atop(MAXPHYS) - 1, UINT8_MAX) /* * The following "find_space" options are supported by vm_map_find(). * * For VMFS_ALIGNED_SPACE, the desired alignment is specified to * the macro argument as log base 2 of the desired alignment. */ #define VMFS_NO_SPACE 0 /* don't find; use the given range */ #define VMFS_ANY_SPACE 1 /* find a range with any alignment */ #define VMFS_OPTIMAL_SPACE 2 /* find a range with optimal alignment*/ #define VMFS_SUPER_SPACE 3 /* find a superpage-aligned range */ #define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find a range with fixed alignment */ /* * vm_map_wire and vm_map_unwire option flags */ #define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ #define VM_MAP_WIRE_USER 1 /* wiring in a user map */ #define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ #define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ #define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ typedef int vm_map_entry_reader(void *token, vm_map_entry_t addr, vm_map_entry_t dest); #ifndef _KERNEL /* * Find the successor of a map_entry, using a reader to dereference pointers. * '*clone' is a copy of a vm_map entry. 'reader' is used to copy a map entry * at some address into '*clone'. Change *clone to a copy of the next map * entry, and return the address of that entry, or NULL if copying has failed. * * This function is made available to user-space code that needs to traverse * map entries. */ static inline vm_map_entry_t vm_map_entry_read_succ(void *token, struct vm_map_entry *const clone, vm_map_entry_reader reader) { vm_map_entry_t after, backup; vm_offset_t start; after = clone->right; start = clone->start; if (!reader(token, after, clone)) return (NULL); backup = clone->left; if (!reader(token, backup, clone)) return (NULL); if (clone->start > start) { do { after = backup; backup = clone->left; if (!reader(token, backup, clone)) return (NULL); } while (clone->start != start); } if (!reader(token, after, clone)) return (NULL); return (after); } #endif /* ! _KERNEL */ #ifdef _KERNEL boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t); int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t); int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); int vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr, vm_offset_t alignment); int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t); int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t); void vm_map_init(vm_map_t, pmap_t, vm_offset_t, vm_offset_t); int vm_map_insert (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int); int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *); int vm_map_lookup_locked(vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *); void vm_map_lookup_done (vm_map_t, vm_map_entry_t); boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *); static inline vm_map_entry_t vm_map_entry_first(vm_map_t map) { return (map->header.right); } static inline vm_map_entry_t vm_map_entry_succ(vm_map_entry_t entry) { vm_map_entry_t after; after = entry->right; if (after->left->start > entry->start) { do after = after->left; while (after->left != entry); } return (after); } #define VM_MAP_ENTRY_FOREACH(it, map) \ for ((it) = vm_map_entry_first(map); \ (it) != &(map)->header; \ (it) = vm_map_entry_succ(it)) int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t); int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t); void vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev, vm_map_entry_t entry); void vm_map_startup (void); int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t); int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int); int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); long vmspace_swap_count(struct vmspace *vmspace); void vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add); #endif /* _KERNEL */ #endif /* _VM_MAP_ */