Index: head/sys/fs/tmpfs/tmpfs_subr.c =================================================================== --- head/sys/fs/tmpfs/tmpfs_subr.c (revision 354868) +++ head/sys/fs/tmpfs/tmpfs_subr.c (revision 354869) @@ -1,1873 +1,1872 @@ /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system supporting functions. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW, 0, "tmpfs file system"); static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; static int sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) { int error; long pages, bytes; pages = *(long *)arg1; bytes = pages * PAGE_SIZE; error = sysctl_handle_long(oidp, &bytes, 0, req); if (error || !req->newptr) return (error); pages = bytes / PAGE_SIZE; if (pages < TMPFS_PAGES_MINRESERVED) return (EINVAL); *(long *)arg1 = pages; return (0); } SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, CTLTYPE_LONG|CTLFLAG_RW, &tmpfs_pages_reserved, 0, sysctl_mem_reserved, "L", "Amount of available memory and swap below which tmpfs growth stops"); static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b); RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); size_t tmpfs_mem_avail(void) { vm_ooffset_t avail; avail = swap_pager_avail + vm_free_count() - tmpfs_pages_reserved; if (__predict_false(avail < 0)) avail = 0; return (avail); } size_t tmpfs_pages_used(struct tmpfs_mount *tmp) { const size_t node_size = sizeof(struct tmpfs_node) + sizeof(struct tmpfs_dirent); size_t meta_pages; meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, PAGE_SIZE); return (meta_pages + tmp->tm_pages_used); } static size_t tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) { if (tmpfs_mem_avail() < req_pages) return (0); if (tmp->tm_pages_max != ULONG_MAX && tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) return (0); return (1); } void tmpfs_ref_node(struct tmpfs_node *node) { TMPFS_NODE_LOCK(node); tmpfs_ref_node_locked(node); TMPFS_NODE_UNLOCK(node); } void tmpfs_ref_node_locked(struct tmpfs_node *node) { TMPFS_NODE_ASSERT_LOCKED(node); KASSERT(node->tn_refcount > 0, ("node %p zero refcount", node)); KASSERT(node->tn_refcount < UINT_MAX, ("node %p refcount %u", node, node->tn_refcount)); node->tn_refcount++; } /* * Allocates a new node of type 'type' inside the 'tmp' mount point, with * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', * using the credentials of the process 'p'. * * If the node type is set to 'VDIR', then the parent parameter must point * to the parent directory of the node being created. It may only be NULL * while allocating the root node. * * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter * specifies the device the node represents. * * If the node type is set to 'VLNK', then the parameter target specifies * the file name of the target file for the symbolic link that is being * created. * * Note that new nodes are retrieved from the available list if it has * items or, if it is empty, from the node pool as long as there is enough * space to create them. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type, uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, const char *target, dev_t rdev, struct tmpfs_node **node) { struct tmpfs_node *nnode; vm_object_t obj; /* If the root directory of the 'tmp' file system is not yet * allocated, this must be the request to do it. */ MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); MPASS(IFF(type == VLNK, target != NULL)); MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL)); if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) return (ENOSPC); if (tmpfs_pages_check_avail(tmp, 1) == 0) return (ENOSPC); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { /* * When a new tmpfs node is created for fully * constructed mount point, there must be a parent * node, which vnode is locked exclusively. As * consequence, if the unmount is executing in * parallel, vflush() cannot reclaim the parent vnode. * Due to this, the check for MNTK_UNMOUNT flag is not * racy: if we did not see MNTK_UNMOUNT flag, then tmp * cannot be destroyed until node construction is * finished and the parent vnode unlocked. * * Tmpfs does not need to instantiate new nodes during * unmount. */ return (EBUSY); } if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) return (EROFS); nnode = (struct tmpfs_node *)uma_zalloc_arg(tmp->tm_node_pool, tmp, M_WAITOK); /* Generic initialization. */ nnode->tn_type = type; vfs_timestamp(&nnode->tn_atime); nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = nnode->tn_atime; nnode->tn_uid = uid; nnode->tn_gid = gid; nnode->tn_mode = mode; nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); nnode->tn_refcount = 1; /* Type-specific initialization. */ switch (nnode->tn_type) { case VBLK: case VCHR: nnode->tn_rdev = rdev; break; case VDIR: RB_INIT(&nnode->tn_dir.tn_dirhead); LIST_INIT(&nnode->tn_dir.tn_dupindex); MPASS(parent != nnode); MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; nnode->tn_dir.tn_readdir_lastn = 0; nnode->tn_dir.tn_readdir_lastp = NULL; nnode->tn_links++; TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); nnode->tn_dir.tn_parent->tn_links++; TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); break; case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: MPASS(strlen(target) < MAXPATHLEN); nnode->tn_size = strlen(target); nnode->tn_link = malloc(nnode->tn_size, M_TMPFSNAME, M_WAITOK); memcpy(nnode->tn_link, target, nnode->tn_size); break; case VREG: obj = nnode->tn_reg.tn_aobj = vm_pager_allocate(OBJT_SWAP, NULL, 0, VM_PROT_DEFAULT, 0, NULL /* XXXKIB - tmpfs needs swap reservation */); VM_OBJECT_WLOCK(obj); /* OBJ_TMPFS is set together with the setting of vp->v_object */ - vm_object_set_flag(obj, OBJ_NOSPLIT | OBJ_TMPFS_NODE); - vm_object_clear_flag(obj, OBJ_ONEMAPPING); + vm_object_set_flag(obj, OBJ_TMPFS_NODE); VM_OBJECT_WUNLOCK(obj); break; default: panic("tmpfs_alloc_node: type %p %d", nnode, (int)nnode->tn_type); } TMPFS_LOCK(tmp); LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); nnode->tn_attached = true; tmp->tm_nodes_inuse++; tmp->tm_refcount++; TMPFS_UNLOCK(tmp); *node = nnode; return (0); } /* * Destroys the node pointed to by node from the file system 'tmp'. * If the node references a directory, no entries are allowed. */ void tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) { TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(node); if (!tmpfs_free_node_locked(tmp, node, false)) { TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); } } bool tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, bool detach) { vm_object_t uobj; TMPFS_MP_ASSERT_LOCKED(tmp); TMPFS_NODE_ASSERT_LOCKED(node); KASSERT(node->tn_refcount > 0, ("node %p refcount zero", node)); node->tn_refcount--; if (node->tn_attached && (detach || node->tn_refcount == 0)) { MPASS(tmp->tm_nodes_inuse > 0); tmp->tm_nodes_inuse--; LIST_REMOVE(node, tn_entries); node->tn_attached = false; } if (node->tn_refcount > 0) return (false); #ifdef INVARIANTS MPASS(node->tn_vnode == NULL); MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); #endif TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VDIR: /* FALLTHROUGH */ case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: free(node->tn_link, M_TMPFSNAME); break; case VREG: uobj = node->tn_reg.tn_aobj; if (uobj != NULL) { if (uobj->size != 0) atomic_subtract_long(&tmp->tm_pages_used, uobj->size); KASSERT((uobj->flags & OBJ_TMPFS) == 0, ("leaked OBJ_TMPFS node %p vm_obj %p", node, uobj)); vm_object_deallocate(uobj); } break; default: panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type); } uma_zfree(tmp->tm_node_pool, node); TMPFS_LOCK(tmp); tmpfs_free_tmp(tmp); return (true); } static __inline uint32_t tmpfs_dirent_hash(const char *name, u_int len) { uint32_t hash; hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP hash &= 0xf; #endif if (hash < TMPFS_DIRCOOKIE_MIN) hash += TMPFS_DIRCOOKIE_MIN; return (hash); } static __inline off_t tmpfs_dirent_cookie(struct tmpfs_dirent *de) { if (de == NULL) return (TMPFS_DIRCOOKIE_EOF); MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); return (de->td_cookie); } static __inline boolean_t tmpfs_dirent_dup(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); } static __inline boolean_t tmpfs_dirent_duphead(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); } void tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) { de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); memcpy(de->ud.td_name, name, namelen); de->td_namelen = namelen; } /* * Allocates a new directory entry for the node node with a name of name. * The new directory entry is returned in *de. * * The link count of node is increased by one to reflect the new object * referencing it. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, const char *name, u_int len, struct tmpfs_dirent **de) { struct tmpfs_dirent *nde; nde = uma_zalloc(tmp->tm_dirent_pool, M_WAITOK); nde->td_node = node; if (name != NULL) { nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); tmpfs_dirent_init(nde, name, len); } else nde->td_namelen = 0; if (node != NULL) node->tn_links++; *de = nde; return 0; } /* * Frees a directory entry. It is the caller's responsibility to destroy * the node referenced by it if needed. * * The link count of node is decreased by one to reflect the removal of an * object that referenced it. This only happens if 'node_exists' is true; * otherwise the function will not access the node referred to by the * directory entry, as it may already have been released from the outside. */ void tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) { struct tmpfs_node *node; node = de->td_node; if (node != NULL) { MPASS(node->tn_links > 0); node->tn_links--; } if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) free(de->ud.td_name, M_TMPFSNAME); uma_zfree(tmp->tm_dirent_pool, de); } void tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) { ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); if (vp->v_type != VREG || obj == NULL) return; VM_OBJECT_WLOCK(obj); VI_LOCK(vp); vm_object_clear_flag(obj, OBJ_TMPFS); obj->un_pager.swp.swp_tmpfs = NULL; if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(obj); } /* * Need to clear v_object for insmntque failure. */ static void tmpfs_insmntque_dtr(struct vnode *vp, void *dtr_arg) { tmpfs_destroy_vobject(vp, vp->v_object); vp->v_object = NULL; vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Allocates a new vnode for the node node or returns a new reference to * an existing one if the node had already a vnode referencing it. The * resulting locked vnode is returned in *vpp. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, struct vnode **vpp) { struct vnode *vp; struct tmpfs_mount *tm; vm_object_t object; int error; error = 0; tm = VFS_TO_TMPFS(mp); TMPFS_NODE_LOCK(node); tmpfs_ref_node_locked(node); loop: TMPFS_NODE_ASSERT_LOCKED(node); if ((vp = node->tn_vnode) != NULL) { MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); VI_LOCK(vp); if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || ((vp->v_iflag & VI_DOOMED) != 0 && (lkflag & LK_NOWAIT) != 0)) { VI_UNLOCK(vp); TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } if ((vp->v_iflag & VI_DOOMED) != 0) { VI_UNLOCK(vp); node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 0, "tmpfsE", 0); } goto loop; } TMPFS_NODE_UNLOCK(node); error = vget(vp, lkflag | LK_INTERLOCK, curthread); if (error == ENOENT) { TMPFS_NODE_LOCK(node); goto loop; } if (error != 0) { vp = NULL; goto out; } /* * Make sure the vnode is still there after * getting the interlock to avoid racing a free. */ if (node->tn_vnode == NULL || node->tn_vnode != vp) { vput(vp); TMPFS_NODE_LOCK(node); goto loop; } goto out; } if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } /* * otherwise lock the vp list while we call getnewvnode * since that can block. */ if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { node->tn_vpstate |= TMPFS_VNODE_WANT; error = msleep((caddr_t) &node->tn_vpstate, TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); if (error != 0) goto out; goto loop; } else node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; TMPFS_NODE_UNLOCK(node); /* Get a new vnode and associate it with our node. */ error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); if (error != 0) goto unlock; MPASS(vp != NULL); /* lkflag is ignored, the lock is exclusive */ (void) vn_lock(vp, lkflag | LK_RETRY); vp->v_data = node; vp->v_type = node->tn_type; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VSOCK: break; case VFIFO: vp->v_op = &tmpfs_fifoop_entries; break; case VREG: object = node->tn_reg.tn_aobj; VM_OBJECT_WLOCK(object); VI_LOCK(vp); KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); vp->v_object = object; object->un_pager.swp.swp_tmpfs = vp; vm_object_set_flag(object, OBJ_TMPFS); VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); break; case VDIR: MPASS(node->tn_dir.tn_parent != NULL); if (node->tn_dir.tn_parent == node) vp->v_vflag |= VV_ROOT; break; default: panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); } if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); error = insmntque1(vp, mp, tmpfs_insmntque_dtr, NULL); if (error != 0) vp = NULL; unlock: TMPFS_NODE_LOCK(node); MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; node->tn_vnode = vp; if (node->tn_vpstate & TMPFS_VNODE_WANT) { node->tn_vpstate &= ~TMPFS_VNODE_WANT; TMPFS_NODE_UNLOCK(node); wakeup((caddr_t) &node->tn_vpstate); } else TMPFS_NODE_UNLOCK(node); out: if (error == 0) { *vpp = vp; #ifdef INVARIANTS MPASS(*vpp != NULL && VOP_ISLOCKED(*vpp)); TMPFS_NODE_LOCK(node); MPASS(*vpp == node->tn_vnode); TMPFS_NODE_UNLOCK(node); #endif } tmpfs_free_node(tm, node); return (error); } /* * Destroys the association between the vnode vp and the node it * references. */ void tmpfs_free_vp(struct vnode *vp) { struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); TMPFS_NODE_ASSERT_LOCKED(node); node->tn_vnode = NULL; if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) wakeup(&node->tn_vnode); node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; vp->v_data = NULL; } /* * Allocates a new file of type 'type' and adds it to the parent directory * 'dvp'; this addition is done using the component name given in 'cnp'. * The ownership of the new file is automatically assigned based on the * credentials of the caller (through 'cnp'), the group is set based on * the parent directory and the mode is determined from the 'vap' argument. * If successful, *vpp holds a vnode to the newly created file and zero * is returned. Otherwise *vpp is NULL and the function returns an * appropriate error code. */ int tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, struct componentname *cnp, const char *target) { int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; struct tmpfs_node *parent; ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); MPASS(cnp->cn_flags & HASBUF); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULL; /* If the entry we are creating is a directory, we cannot overflow * the number of links of its parent, because it will get a new * link. */ if (vap->va_type == VDIR) { /* Ensure that we do not overflow the maximum number of links * imposed by the system. */ MPASS(dnode->tn_links <= TMPFS_LINK_MAX); if (dnode->tn_links == TMPFS_LINK_MAX) { return (EMLINK); } parent = dnode; MPASS(parent != NULL); } else parent = NULL; /* Allocate a node that represents the new file. */ error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, target, vap->va_rdev, &node); if (error != 0) return (error); /* Allocate a directory entry that points to the new file. */ error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) { tmpfs_free_node(tmp, node); return (error); } /* Allocate a vnode for the new file. */ error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); if (error != 0) { tmpfs_free_dirent(tmp, de); tmpfs_free_node(tmp, node); return (error); } /* Now that all required items are allocated, we can proceed to * insert the new node into the directory, an operation that * cannot fail. */ if (cnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(dvp, cnp); tmpfs_dir_attach(dvp, de); return (0); } struct tmpfs_dirent * tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); dc->tdc_tree = de; if (de != NULL && tmpfs_dirent_duphead(de)) de = LIST_FIRST(&de->ud.td_duphead); dc->tdc_current = de; return (dc->tdc_current); } struct tmpfs_dirent * tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; MPASS(dc->tdc_tree != NULL); if (tmpfs_dirent_dup(dc->tdc_current)) { dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); if (dc->tdc_current != NULL) return (dc->tdc_current); } dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, dc->tdc_tree); if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); } /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ static struct tmpfs_dirent * tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) { struct tmpfs_dirent *de, dekey; dekey.td_hash = hash; de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); return (de); } /* Lookup directory entry by cookie, initialize directory cursor accordingly. */ static struct tmpfs_dirent * tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, struct tmpfs_dir_cursor *dc) { struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; struct tmpfs_dirent *de, dekey; MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); if (cookie == node->tn_dir.tn_readdir_lastn && (de = node->tn_dir.tn_readdir_lastp) != NULL) { /* Protect against possible race, tn_readdir_last[pn] * may be updated with only shared vnode lock held. */ if (cookie == tmpfs_dirent_cookie(de)) goto out; } if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { LIST_FOREACH(de, &node->tn_dir.tn_dupindex, uh.td_dup.index_entries) { MPASS(tmpfs_dirent_dup(de)); if (de->td_cookie == cookie) goto out; /* dupindex list is sorted. */ if (de->td_cookie < cookie) { de = NULL; goto out; } } MPASS(de == NULL); goto out; } if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) { de = NULL; } else { dekey.td_hash = cookie; /* Recover if direntry for cookie was removed */ de = RB_NFIND(tmpfs_dir, dirhead, &dekey); } dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); out: dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_dup(de)) dc->tdc_tree = tmpfs_dir_xlookup_hash(node, de->td_hash); return (dc->tdc_current); } /* * Looks for a directory entry in the directory represented by node. * 'cnp' describes the name of the entry to look for. Note that the . * and .. components are not allowed as they do not physically exist * within directories. * * Returns a pointer to the entry when found, otherwise NULL. */ struct tmpfs_dirent * tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, struct componentname *cnp) { struct tmpfs_dir_duphead *duphead; struct tmpfs_dirent *de; uint32_t hash; MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.'))); TMPFS_VALIDATE_DIR(node); hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); de = tmpfs_dir_xlookup_hash(node, hash); if (de != NULL && tmpfs_dirent_duphead(de)) { duphead = &de->ud.td_duphead; LIST_FOREACH(de, duphead, uh.td_dup.entries) { if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) break; } } else if (de != NULL) { if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) de = NULL; } if (de != NULL && f != NULL && de->td_node != f) de = NULL; return (de); } /* * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex * list, allocate new cookie value. */ static void tmpfs_dir_attach_dup(struct tmpfs_node *dnode, struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) { struct tmpfs_dir_duphead *dupindex; struct tmpfs_dirent *de, *pde; dupindex = &dnode->tn_dir.tn_dupindex; de = LIST_FIRST(dupindex); if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { if (de == NULL) nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; else nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } /* * Cookie numbers are near exhaustion. Scan dupindex list for unused * numbers. dupindex list is sorted in descending order. Keep it so * after inserting nde. */ while (1) { pde = de; de = LIST_NEXT(de, uh.td_dup.index_entries); if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { /* * Last element of the index doesn't have minimal cookie * value, use it. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } else if (de == NULL) { /* * We are so lucky have 2^30 hash duplicates in single * directory :) Return largest possible cookie value. * It should be fine except possible issues with * VOP_READDIR restart. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } if (de->td_cookie + 1 == pde->td_cookie || de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) continue; /* No hole or invalid cookie. */ nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); MPASS(pde->td_cookie > nde->td_cookie); MPASS(nde->td_cookie > de->td_cookie); LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } } /* * Attaches the directory entry de to the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_alloc_dirent. */ void tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_node *dnode; struct tmpfs_dirent *xde, *nde; ASSERT_VOP_ELOCKED(vp, __func__); MPASS(de->td_namelen > 0); MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); MPASS(de->td_cookie == de->td_hash); dnode = VP_TO_TMPFS_DIR(vp); dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; MPASS(!tmpfs_dirent_dup(de)); xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); if (xde != NULL && tmpfs_dirent_duphead(xde)) tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); else if (xde != NULL) { /* * Allocate new duphead. Swap xde with duphead to avoid * adding/removing elements with the same hash. */ MPASS(!tmpfs_dirent_dup(xde)); tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, &nde); /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ memcpy(nde, xde, sizeof(*xde)); xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; LIST_INIT(&xde->ud.td_duphead); xde->td_namelen = 0; xde->td_node = NULL; tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); } dnode->tn_size += sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; tmpfs_update(vp); } /* * Detaches the directory entry de from the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_free_dirent. */ void tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_mount *tmp; struct tmpfs_dir *head; struct tmpfs_node *dnode; struct tmpfs_dirent *xde; ASSERT_VOP_ELOCKED(vp, __func__); dnode = VP_TO_TMPFS_DIR(vp); head = &dnode->tn_dir.tn_dirhead; dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; if (tmpfs_dirent_dup(de)) { /* Remove duphead if de was last entry. */ if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); MPASS(tmpfs_dirent_duphead(xde)); } else xde = NULL; LIST_REMOVE(de, uh.td_dup.entries); LIST_REMOVE(de, uh.td_dup.index_entries); if (xde != NULL) { if (LIST_EMPTY(&xde->ud.td_duphead)) { RB_REMOVE(tmpfs_dir, head, xde); tmp = VFS_TO_TMPFS(vp->v_mount); MPASS(xde->td_node == NULL); tmpfs_free_dirent(tmp, xde); } } de->td_cookie = de->td_hash; } else RB_REMOVE(tmpfs_dir, head, de); dnode->tn_size -= sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; tmpfs_update(vp); } void tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) { struct tmpfs_dirent *de, *dde, *nde; RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); /* Node may already be destroyed. */ de->td_node = NULL; if (tmpfs_dirent_duphead(de)) { while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { LIST_REMOVE(dde, uh.td_dup.entries); dde->td_node = NULL; tmpfs_free_dirent(tmp, dde); } } tmpfs_free_dirent(tmp, de); } } /* * Helper function for tmpfs_readdir. Creates a '.' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio) { int error; struct dirent dent; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); dent.d_fileno = node->tn_id; dent.d_type = DT_DIR; dent.d_namlen = 1; dent.d_name[0] = '.'; dent.d_reclen = GENERIC_DIRSIZ(&dent); dirent_terminate(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED); return (error); } /* * Helper function for tmpfs_readdir. Creates a '..' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio) { struct tmpfs_node *parent; struct dirent dent; int error; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); /* * Return ENOENT if the current node is already removed. */ TMPFS_ASSERT_LOCKED(node); parent = node->tn_dir.tn_parent; if (parent == NULL) return (ENOENT); TMPFS_NODE_LOCK(parent); dent.d_fileno = parent->tn_id; TMPFS_NODE_UNLOCK(parent); dent.d_type = DT_DIR; dent.d_namlen = 2; dent.d_name[0] = '.'; dent.d_name[1] = '.'; dent.d_reclen = GENERIC_DIRSIZ(&dent); dirent_terminate(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED); return (error); } /* * Helper function for tmpfs_readdir. Returns as much directory entries * as can fit in the uio space. The read starts at uio->uio_offset. * The function returns 0 on success, -1 if there was not enough space * in the uio structure to hold the directory entry or an appropriate * error code if another error happens. */ int tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio, int maxcookies, u_long *cookies, int *ncookies) { struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de; off_t off; int error; TMPFS_VALIDATE_DIR(node); off = 0; /* * Lookup the node from the current offset. The starting offset of * 0 will lookup both '.' and '..', and then the first real entry, * or EOF if there are none. Then find all entries for the dir that * fit into the buffer. Once no more entries are found (de == NULL), * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next * call to return 0. */ switch (uio->uio_offset) { case TMPFS_DIRCOOKIE_DOT: error = tmpfs_dir_getdotdent(tm, node, uio); if (error != 0) return (error); uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT; if (cookies != NULL) cookies[(*ncookies)++] = off = uio->uio_offset; /* FALLTHROUGH */ case TMPFS_DIRCOOKIE_DOTDOT: error = tmpfs_dir_getdotdotdent(tm, node, uio); if (error != 0) return (error); de = tmpfs_dir_first(node, &dc); uio->uio_offset = tmpfs_dirent_cookie(de); if (cookies != NULL) cookies[(*ncookies)++] = off = uio->uio_offset; /* EOF. */ if (de == NULL) return (0); break; case TMPFS_DIRCOOKIE_EOF: return (0); default: de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); if (de == NULL) return (EINVAL); if (cookies != NULL) off = tmpfs_dirent_cookie(de); } /* Read as much entries as possible; i.e., until we reach the end of * the directory or we exhaust uio space. */ do { struct dirent d; /* Create a dirent structure representing the current * tmpfs_node and fill it. */ if (de->td_node == NULL) { d.d_fileno = 1; d.d_type = DT_WHT; } else { d.d_fileno = de->td_node->tn_id; switch (de->td_node->tn_type) { case VBLK: d.d_type = DT_BLK; break; case VCHR: d.d_type = DT_CHR; break; case VDIR: d.d_type = DT_DIR; break; case VFIFO: d.d_type = DT_FIFO; break; case VLNK: d.d_type = DT_LNK; break; case VREG: d.d_type = DT_REG; break; case VSOCK: d.d_type = DT_SOCK; break; default: panic("tmpfs_dir_getdents: type %p %d", de->td_node, (int)de->td_node->tn_type); } } d.d_namlen = de->td_namelen; MPASS(de->td_namelen < sizeof(d.d_name)); (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); d.d_reclen = GENERIC_DIRSIZ(&d); dirent_terminate(&d); /* Stop reading if the directory entry we are treating is * bigger than the amount of data that can be returned. */ if (d.d_reclen > uio->uio_resid) { error = EJUSTRETURN; break; } /* Copy the new dirent structure into the output buffer and * advance pointers. */ error = uiomove(&d, d.d_reclen, uio); if (error == 0) { de = tmpfs_dir_next(node, &dc); if (cookies != NULL) { off = tmpfs_dirent_cookie(de); MPASS(*ncookies < maxcookies); cookies[(*ncookies)++] = off; } } } while (error == 0 && uio->uio_resid > 0 && de != NULL); /* Skip setting off when using cookies as it is already done above. */ if (cookies == NULL) off = tmpfs_dirent_cookie(de); /* Update the offset and cache. */ uio->uio_offset = off; node->tn_dir.tn_readdir_lastn = off; node->tn_dir.tn_readdir_lastp = de; tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED); return error; } int tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; int error; error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) return (error); tmpfs_dir_attach(dvp, de); return (0); } void tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); MPASS(de != NULL && de->td_node == NULL); tmpfs_dir_detach(dvp, de); tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); } /* * Resizes the aobj associated with the regular file pointed to by 'vp' to the * size 'newsize'. 'vp' must point to a vnode that represents a regular file. * 'newsize' must be positive. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) { struct tmpfs_mount *tmp; struct tmpfs_node *node; vm_object_t uobj; vm_page_t m; vm_pindex_t idx, newpages, oldpages; off_t oldsize; int base, rv; MPASS(vp->v_type == VREG); MPASS(newsize >= 0); node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_reg.tn_aobj; tmp = VFS_TO_TMPFS(vp->v_mount); /* * Convert the old and new sizes to the number of pages needed to * store them. It may happen that we do not need to do anything * because the last allocated page can accommodate the change on * its own. */ oldsize = node->tn_size; oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); MPASS(oldpages == uobj->size); newpages = OFF_TO_IDX(newsize + PAGE_MASK); if (__predict_true(newpages == oldpages && newsize >= oldsize)) { node->tn_size = newsize; return (0); } if (newpages > oldpages && tmpfs_pages_check_avail(tmp, newpages - oldpages) == 0) return (ENOSPC); VM_OBJECT_WLOCK(uobj); if (newsize < oldsize) { /* * Zero the truncated part of the last page. */ base = newsize & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(newsize); retry: m = vm_page_grab(uobj, idx, VM_ALLOC_NOCREAT); if (m != NULL) { MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(uobj, idx, NULL, NULL)) { m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; rv = vm_pager_get_pages(uobj, &m, 1, NULL, NULL); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, * and therefore not recently * accessed, immediately enqueue it * for asynchronous laundering. The * current operation is not regarded * as an access. */ vm_page_lock(m); vm_page_launder(m); vm_page_unlock(m); } else { vm_page_free(m); if (ignerr) m = NULL; else { VM_OBJECT_WUNLOCK(uobj); return (EIO); } } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); vm_page_dirty(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); } } /* * Release any swap space and free any whole pages. */ if (newpages < oldpages) { swap_pager_freespace(uobj, newpages, oldpages - newpages); vm_object_page_remove(uobj, newpages, 0, 0); } } uobj->size = newpages; VM_OBJECT_WUNLOCK(uobj); atomic_add_long(&tmp->tm_pages_used, newpages - oldpages); node->tn_size = newsize; return (0); } void tmpfs_check_mtime(struct vnode *vp) { struct tmpfs_node *node; struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "check_mtime"); if (vp->v_type != VREG) return; obj = vp->v_object; KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) == (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj")); /* unlocked read */ if (obj->generation != obj->cleangeneration) { VM_OBJECT_WLOCK(obj); if (obj->generation != obj->cleangeneration) { obj->cleangeneration = obj->generation; node = VP_TO_TMPFS_NODE(vp); node->tn_status |= TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED; } VM_OBJECT_WUNLOCK(obj); } } /* * Change flags of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chflags"); node = VP_TO_TMPFS_NODE(vp); if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((flags ^ node->tn_flags) & SF_SETTABLE)) return (EPERM); } node->tn_flags = flags; node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chflags2"); return (0); } /* * Change access mode on the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chmod"); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return (EFTYPE); } if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } node->tn_mode &= ~ALLPERMS; node->tn_mode |= mode & ALLPERMS; node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chmod2"); return (0); } /* * Change ownership of the given vnode. At least one of uid or gid must * be different than VNOVAL. If one is set to that value, the attribute * is unchanged. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; uid_t ouid; gid_t ogid; ASSERT_VOP_ELOCKED(vp, "chown"); node = VP_TO_TMPFS_NODE(vp); /* Assign default values if they are unknown. */ MPASS(uid != VNOVAL || gid != VNOVAL); if (uid == VNOVAL) uid = node->tn_uid; if (gid == VNOVAL) gid = node->tn_gid; MPASS(uid != VNOVAL && gid != VNOVAL); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if ((uid != node->tn_uid || (gid != node->tn_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) return (error); ogid = node->tn_gid; ouid = node->tn_uid; node->tn_uid = uid; node->tn_gid = gid; node->tn_status |= TMPFS_NODE_CHANGED; if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) node->tn_mode &= ~(S_ISUID | S_ISGID); } ASSERT_VOP_ELOCKED(vp, "chown2"); return (0); } /* * Change size of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chsize"); node = VP_TO_TMPFS_NODE(vp); /* Decide whether this is a valid operation based on the file type. */ error = 0; switch (vp->v_type) { case VDIR: return EISDIR; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VFIFO: /* Allow modifications of special files even if in the file * system is mounted read-only (we are not modifying the * files themselves, but the objects they represent). */ return 0; default: /* Anything else is unsupported. */ return EOPNOTSUPP; } /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = tmpfs_truncate(vp, size); /* tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents * for us, as will update tn_status; no need to do that here. */ ASSERT_VOP_ELOCKED(vp, "chsize2"); return (error); } /* * Change access and modification times of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chtimes(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *l) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chtimes"); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = vn_utimes_perm(vp, vap, cred, l); if (error != 0) return (error); if (vap->va_atime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_ACCESSED; if (vap->va_mtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_birthtime = vap->va_birthtime; ASSERT_VOP_ELOCKED(vp, "chtimes2"); return (0); } void tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) { if ((node->tn_status & status) == status || tm->tm_ronly) return; TMPFS_NODE_LOCK(node); node->tn_status |= status; TMPFS_NODE_UNLOCK(node); } /* Sync timestamps */ void tmpfs_itimes(struct vnode *vp, const struct timespec *acc, const struct timespec *mod) { struct tmpfs_node *node; struct timespec now; ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); node = VP_TO_TMPFS_NODE(vp); if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) return; vfs_timestamp(&now); TMPFS_NODE_LOCK(node); if (node->tn_status & TMPFS_NODE_ACCESSED) { if (acc == NULL) acc = &now; node->tn_atime = *acc; } if (node->tn_status & TMPFS_NODE_MODIFIED) { if (mod == NULL) mod = &now; node->tn_mtime = *mod; } if (node->tn_status & TMPFS_NODE_CHANGED) node->tn_ctime = now; node->tn_status &= ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); TMPFS_NODE_UNLOCK(node); /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); } void tmpfs_update(struct vnode *vp) { tmpfs_itimes(vp, NULL, NULL); } int tmpfs_truncate(struct vnode *vp, off_t length) { int error; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); if (length < 0) { error = EINVAL; goto out; } if (node->tn_size == length) { error = 0; goto out; } if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); error = tmpfs_reg_resize(vp, length, FALSE); if (error == 0) node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; out: tmpfs_update(vp); return (error); } static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) { if (a->td_hash > b->td_hash) return (1); else if (a->td_hash < b->td_hash) return (-1); return (0); } RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); Index: head/sys/kern/sysv_shm.c =================================================================== --- head/sys/kern/sysv_shm.c (revision 354868) +++ head/sys/kern/sysv_shm.c (revision 354869) @@ -1,1713 +1,1708 @@ /*- * SPDX-License-Identifier: BSD-4-Clause AND BSD-2-Clause-FreeBSD * * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles * Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: sysv_shm.c,v 1.39 1997/10/07 10:02:03 drochner Exp $ */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_shm, "System V shared memory segments support"); static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode); static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum); #define SHMSEG_FREE 0x0200 #define SHMSEG_REMOVED 0x0400 #define SHMSEG_ALLOCATED 0x0800 static int shm_last_free, shm_nused, shmalloced; vm_size_t shm_committed; static struct shmid_kernel *shmsegs; static unsigned shm_prison_slot; struct shmmap_state { vm_offset_t va; int shmid; }; static void shm_deallocate_segment(struct shmid_kernel *); static int shm_find_segment_by_key(struct prison *, key_t); static struct shmid_kernel *shm_find_segment(struct prison *, int, bool); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static void shmrealloc(void); static int shminit(void); static int sysvshm_modload(struct module *, int, void *); static int shmunload(void); #ifndef SYSVSHM static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); #endif static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); static void shm_remove(struct shmid_kernel *, int); static struct prison *shm_find_prison(struct ucred *); static int shm_prison_cansee(struct prison *, struct shmid_kernel *); static int shm_prison_check(void *, void *); static int shm_prison_set(void *, void *); static int shm_prison_get(void *, void *); static int shm_prison_remove(void *, void *); static void shm_prison_cleanup(struct prison *); /* * Tuneable values. */ #ifndef SHMMAXPGS #define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. */ #endif #ifndef SHMMAX #define SHMMAX (SHMMAXPGS*PAGE_SIZE) #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 192 #endif #ifndef SHMSEG #define SHMSEG 128 #endif #ifndef SHMALL #define SHMALL (SHMMAXPGS) #endif struct shminfo shminfo = { .shmmax = SHMMAX, .shmmin = SHMMIN, .shmmni = SHMMNI, .shmseg = SHMSEG, .shmall = SHMALL }; static int shm_use_phys; static int shm_allow_removed = 1; SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RWTUN, &shminfo.shmmax, 0, "Maximum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RWTUN, &shminfo.shmmin, 0, "Minimum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0, "Number of shared memory identifiers"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0, "Number of segments per process"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RWTUN, &shminfo.shmall, 0, "Maximum number of pages available for shared memory"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RWTUN, &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RWTUN, &shm_allow_removed, 0, "Enable/Disable attachment to attached segments marked for removal"); SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_shmsegs, "", "Array of struct shmid_kernel for each potential shared memory segment"); static struct sx sysvshmsx; #define SYSVSHM_LOCK() sx_xlock(&sysvshmsx) #define SYSVSHM_UNLOCK() sx_xunlock(&sysvshmsx) #define SYSVSHM_ASSERT_LOCKED() sx_assert(&sysvshmsx, SA_XLOCKED) static int shm_find_segment_by_key(struct prison *pr, key_t key) { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[i].cred != NULL && shmsegs[i].cred->cr_prison == pr && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); } /* * Finds segment either by shmid if is_shmid is true, or by segnum if * is_shmid is false. */ static struct shmid_kernel * shm_find_segment(struct prison *rpr, int arg, bool is_shmid) { struct shmid_kernel *shmseg; int segnum; segnum = is_shmid ? IPCID_TO_IX(arg) : arg; if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)) || shm_prison_cansee(rpr, shmseg) != 0) return (NULL); return (shmseg); } static void shm_deallocate_segment(struct shmid_kernel *shmseg) { vm_size_t size; SYSVSHM_ASSERT_LOCKED(); vm_object_deallocate(shmseg->object); shmseg->object = NULL; size = round_page(shmseg->u.shm_segsz); shm_committed -= btoc(size); shm_nused--; shmseg->u.shm_perm.mode = SHMSEG_FREE; #ifdef MAC mac_sysvshm_cleanup(shmseg); #endif racct_sub_cred(shmseg->cred, RACCT_NSHM, 1); racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size); crfree(shmseg->cred); shmseg->cred = NULL; } static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) { struct shmid_kernel *shmseg; int segnum, result; vm_size_t size; SYSVSHM_ASSERT_LOCKED(); segnum = IPCID_TO_IX(shmmap_s->shmid); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; size = round_page(shmseg->u.shm_segsz); result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size); if (result != KERN_SUCCESS) return (EINVAL); shmmap_s->shmid = -1; shmseg->u.shm_dtime = time_second; if (--shmseg->u.shm_nattch == 0 && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } return (0); } static void shm_remove(struct shmid_kernel *shmseg, int segnum) { shmseg->u.shm_perm.key = IPC_PRIVATE; shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->u.shm_nattch == 0) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } } static struct prison * shm_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); return rpr; } static int shm_prison_cansee(struct prison *rpr, struct shmid_kernel *shmseg) { if (shmseg->cred == NULL || !(rpr == shmseg->cred->cr_prison || prison_ischild(rpr, shmseg->cred->cr_prison))) return (EINVAL); return (0); } static int kern_shmdt_locked(struct thread *td, const void *shmaddr) { struct proc *p = td->td_proc; struct shmmap_state *shmmap_s; #ifdef MAC int error; #endif int i; SYSVSHM_ASSERT_LOCKED(); if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) return (EINVAL); AUDIT_ARG_SVIPC_ID(shmmap_s->shmid); for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1 && shmmap_s->va == (vm_offset_t)shmaddr) { break; } } if (i == shminfo.shmseg) return (EINVAL); #ifdef MAC error = mac_sysvshm_check_shmdt(td->td_ucred, &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]); if (error != 0) return (error); #endif return (shm_delete_mapping(p->p_vmspace, shmmap_s)); } #ifndef _SYS_SYSPROTO_H_ struct shmdt_args { const void *shmaddr; }; #endif int sys_shmdt(struct thread *td, struct shmdt_args *uap) { int error; SYSVSHM_LOCK(); error = kern_shmdt_locked(td, uap->shmaddr); SYSVSHM_UNLOCK(); return (error); } static int kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, int shmflg) { struct prison *rpr; struct proc *p = td->td_proc; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s; vm_offset_t attach_va; vm_prot_t prot; vm_size_t size; int cow, error, find_space, i, rv; AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_VALUE(shmflg); SYSVSHM_ASSERT_LOCKED(); rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state), M_SHM, M_WAITOK); for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; KASSERT(p->p_vmspace->vm_shm == NULL, ("raced")); p->p_vmspace->vm_shm = shmmap_s; } shmseg = shm_find_segment(rpr, shmid, true); if (shmseg == NULL) return (EINVAL); error = ipcperm(td, &shmseg->u.shm_perm, (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error != 0) return (error); #ifdef MAC error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg); if (error != 0) return (error); #endif for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } if (i >= shminfo.shmseg) return (EMFILE); size = round_page(shmseg->u.shm_segsz); prot = VM_PROT_READ; cow = MAP_INHERIT_SHARE | MAP_PREFAULT_PARTIAL; if ((shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; if (shmaddr != NULL) { if ((shmflg & SHM_RND) != 0) attach_va = rounddown2((vm_offset_t)shmaddr, SHMLBA); else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) attach_va = (vm_offset_t)shmaddr; else return (EINVAL); if ((shmflg & SHM_REMAP) != 0) cow |= MAP_REMAP; find_space = VMFS_NO_SPACE; } else { /* * This is just a hint to vm_map_find() about where to * put it. */ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(td, RLIMIT_DATA)); find_space = VMFS_OPTIMAL_SPACE; } vm_object_reference(shmseg->object); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object, 0, &attach_va, size, 0, find_space, prot, prot, cow); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->object); return (ENOMEM); } shmmap_s->va = attach_va; shmmap_s->shmid = shmid; shmseg->u.shm_lpid = p->p_pid; shmseg->u.shm_atime = time_second; shmseg->u.shm_nattch++; td->td_retval[0] = attach_va; return (error); } int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg) { int error; SYSVSHM_LOCK(); error = kern_shmat_locked(td, shmid, shmaddr, shmflg); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmat_args { int shmid; const void *shmaddr; int shmflg; }; #endif int sys_shmat(struct thread *td, struct shmat_args *uap) { return (kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg)); } static int kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { struct prison *rpr; struct shmid_kernel *shmseg; struct shmid_ds *shmidp; struct shm_info shm_info; int error; SYSVSHM_ASSERT_LOCKED(); rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_SVIPC_CMD(cmd); switch (cmd) { /* * It is possible that kern_shmctl is being called from the Linux ABI * layer, in which case, we will need to implement IPC_INFO. It should * be noted that other shmctl calls will be funneled through here for * Linix binaries as well. * * NB: The Linux ABI layer will convert this data to structure(s) more * consistent with the Linux ABI. */ case IPC_INFO: memcpy(buf, &shminfo, sizeof(shminfo)); if (bufsz) *bufsz = sizeof(shminfo); td->td_retval[0] = shmalloced; return (0); case SHM_INFO: { shm_info.used_ids = shm_nused; shm_info.shm_rss = 0; /*XXX where to get from ? */ shm_info.shm_tot = 0; /*XXX where to get from ? */ shm_info.shm_swp = 0; /*XXX where to get from ? */ shm_info.swap_attempts = 0; /*XXX where to get from ? */ shm_info.swap_successes = 0; /*XXX where to get from ? */ memcpy(buf, &shm_info, sizeof(shm_info)); if (bufsz != NULL) *bufsz = sizeof(shm_info); td->td_retval[0] = shmalloced; return (0); } } shmseg = shm_find_segment(rpr, shmid, cmd != SHM_STAT); if (shmseg == NULL) return (EINVAL); #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd); if (error != 0) return (error); #endif switch (cmd) { case SHM_STAT: case IPC_STAT: shmidp = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) return (error); memcpy(shmidp, &shmseg->u, sizeof(struct shmid_ds)); if (td->td_ucred->cr_prison != shmseg->cred->cr_prison) shmidp->shm_perm.key = IPC_PRIVATE; if (bufsz != NULL) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) { td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm); } break; case IPC_SET: shmidp = (struct shmid_ds *)buf; AUDIT_ARG_SVIPC_PERM(&shmidp->shm_perm); error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shmseg->u.shm_perm.uid = shmidp->shm_perm.uid; shmseg->u.shm_perm.gid = shmidp->shm_perm.gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & ~ACCESSPERMS) | (shmidp->shm_perm.mode & ACCESSPERMS); shmseg->u.shm_ctime = time_second; break; case IPC_RMID: error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shm_remove(shmseg, IPCID_TO_IX(shmid)); break; #if 0 case SHM_LOCK: case SHM_UNLOCK: #endif default: error = EINVAL; break; } return (error); } int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { int error; SYSVSHM_LOCK(); error = kern_shmctl_locked(td, shmid, cmd, buf, bufsz); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmctl_args { int shmid; int cmd; struct shmid_ds *buf; }; #endif int sys_shmctl(struct thread *td, struct shmctl_args *uap) { int error; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not to support this. */ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds)))) goto done; } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: error = copyout(&buf, uap->buf, bufsz); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum) { struct shmid_kernel *shmseg; #ifdef MAC int error; #endif SYSVSHM_ASSERT_LOCKED(); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); #ifdef MAC error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg); if (error != 0) return (error); #endif if (uap->size != 0 && uap->size > shmseg->u.shm_segsz) return (EINVAL); td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode) { struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; int i, segnum; size_t size; SYSVSHM_ASSERT_LOCKED(); if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) return (EINVAL); if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ return (ENOSPC); size = round_page(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return (ENOMEM); if (shm_last_free < 0) { shmrealloc(); /* Maybe expand the shmsegs[] array. */ for (i = 0; i < shmalloced; i++) if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE) break; if (i == shmalloced) return (ENOSPC); segnum = i; } else { segnum = shm_last_free; shm_last_free = -1; } KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_NSHM, 1)) { PROC_UNLOCK(td->td_proc); return (ENOSPC); } if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { racct_sub(td->td_proc, RACCT_NSHM, 1); PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); } #endif /* * We make sure that we have allocated a pager before we need * to. */ shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_NSHM, 1); racct_sub(td->td_proc, RACCT_SHMSIZE, size); PROC_UNLOCK(td->td_proc); } #endif return (ENOMEM); } - shm_object->pg_color = 0; - VM_OBJECT_WLOCK(shm_object); - vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); - vm_object_set_flag(shm_object, OBJ_COLORED | OBJ_NOSPLIT); - VM_OBJECT_WUNLOCK(shm_object); shmseg->object = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_perm.key = uap->key; shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff; shmseg->cred = crhold(cred); shmseg->u.shm_segsz = uap->size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; shmseg->u.shm_atime = shmseg->u.shm_dtime = 0; #ifdef MAC mac_sysvshm_create(cred, shmseg); #endif shmseg->u.shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } #ifndef _SYS_SYSPROTO_H_ struct shmget_args { key_t key; size_t size; int shmflg; }; #endif int sys_shmget(struct thread *td, struct shmget_args *uap) { int segnum, mode; int error; if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); mode = uap->shmflg & ACCESSPERMS; SYSVSHM_LOCK(); if (uap->key == IPC_PRIVATE) { error = shmget_allocate_segment(td, uap, mode); } else { segnum = shm_find_segment_by_key(td->td_ucred->cr_prison, uap->key); if (segnum >= 0) error = shmget_existing(td, uap, mode, segnum); else if ((uap->shmflg & IPC_CREAT) == 0) error = ENOENT; else error = shmget_allocate_segment(td, uap, mode); } SYSVSHM_UNLOCK(); return (error); } #ifdef SYSVSHM void shmfork(struct proc *p1, struct proc *p2) #else static void shmfork_myhook(struct proc *p1, struct proc *p2) #endif { struct shmmap_state *shmmap_s; size_t size; int i; SYSVSHM_LOCK(); size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); bcopy(p1->p_vmspace->vm_shm, shmmap_s, size); p2->p_vmspace->vm_shm = shmmap_s; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1) { KASSERT(IPCID_TO_IX(shmmap_s->shmid) >= 0 && IPCID_TO_IX(shmmap_s->shmid) < shmalloced, ("segnum %d shmalloced %d", IPCID_TO_IX(shmmap_s->shmid), shmalloced)); shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; } } SYSVSHM_UNLOCK(); } #ifdef SYSVSHM void shmexit(struct vmspace *vm) #else static void shmexit_myhook(struct vmspace *vm) #endif { struct shmmap_state *base, *shm; int i; base = vm->vm_shm; if (base != NULL) { vm->vm_shm = NULL; SYSVSHM_LOCK(); for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) { if (shm->shmid != -1) shm_delete_mapping(vm, shm); } SYSVSHM_UNLOCK(); free(base, M_SHM); } } static void shmrealloc(void) { struct shmid_kernel *newsegs; int i; SYSVSHM_ASSERT_LOCKED(); if (shmalloced >= shminfo.shmmni) return; newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK | M_ZERO); for (i = 0; i < shmalloced; i++) bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); for (; i < shminfo.shmmni; i++) { newsegs[i].u.shm_perm.mode = SHMSEG_FREE; newsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&newsegs[i]); #endif } free(shmsegs, M_SHM); shmsegs = newsegs; shmalloced = shminfo.shmmni; } static struct syscall_helper_data shm_syscalls[] = { SYSCALL_INIT_HELPER(shmat), SYSCALL_INIT_HELPER(shmctl), SYSCALL_INIT_HELPER(shmdt), SYSCALL_INIT_HELPER(shmget), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl), #endif #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) SYSCALL_INIT_HELPER(shmsys), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data shm32_syscalls[] = { SYSCALL32_INIT_HELPER_COMPAT(shmat), SYSCALL32_INIT_HELPER_COMPAT(shmdt), SYSCALL32_INIT_HELPER_COMPAT(shmget), SYSCALL32_INIT_HELPER(freebsd32_shmsys), SYSCALL32_INIT_HELPER(freebsd32_shmctl), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl), #endif SYSCALL_INIT_LAST }; #endif static int shminit(void) { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = shm_prison_check, [PR_METHOD_SET] = shm_prison_set, [PR_METHOD_GET] = shm_prison_get, [PR_METHOD_REMOVE] = shm_prison_remove, }; #ifndef BURN_BRIDGES if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0) printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n"); #endif if (shminfo.shmmax == SHMMAX) { /* Initialize shmmax dealing with possible overflow. */ for (i = PAGE_SIZE; i != 0; i--) { shminfo.shmmax = shminfo.shmall * i; if ((shminfo.shmmax / shminfo.shmall) == (u_long)i) break; } } shmalloced = shminfo.shmmni; shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK|M_ZERO); for (i = 0; i < shmalloced; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&shmsegs[i]); #endif } shm_last_free = 0; shm_nused = 0; shm_committed = 0; sx_init(&sysvshmsx, "sysvshmsx"); #ifndef SYSVSHM shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; #endif /* Set current prisons according to their allow.sysvipc. */ shm_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(shm_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, shm_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(shm_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(shm_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(shm32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int shmunload(void) { int i; if (shm_nused > 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(shm32_syscalls); #endif syscall_helper_unregister(shm_syscalls); if (shm_prison_slot != 0) osd_jail_deregister(shm_prison_slot); for (i = 0; i < shmalloced; i++) { #ifdef MAC mac_sysvshm_destroy(&shmsegs[i]); #endif /* * Objects might be still mapped into the processes * address spaces. Actual free would happen on the * last mapping destruction. */ if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE) vm_object_deallocate(shmsegs[i].object); } free(shmsegs, M_SHM); #ifndef SYSVSHM shmexit_hook = NULL; shmfork_hook = NULL; #endif sx_destroy(&sysvshmsx); return (0); } static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { struct shmid_kernel tshmseg; #ifdef COMPAT_FREEBSD32 struct shmid_kernel32 tshmseg32; #endif struct prison *pr, *rpr; void *outaddr; size_t outsize; int error, i; SYSVSHM_LOCK(); pr = req->td->td_ucred->cr_prison; rpr = shm_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < shmalloced; i++) { if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) { bzero(&tshmseg, sizeof(tshmseg)); tshmseg.u.shm_perm.mode = SHMSEG_FREE; } else { tshmseg = shmsegs[i]; if (tshmseg.cred->cr_prison != pr) tshmseg.u.shm_perm.key = IPC_PRIVATE; } #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { bzero(&tshmseg32, sizeof(tshmseg32)); freebsd32_ipcperm_out(&tshmseg.u.shm_perm, &tshmseg32.u.shm_perm); CP(tshmseg, tshmseg32, u.shm_segsz); CP(tshmseg, tshmseg32, u.shm_lpid); CP(tshmseg, tshmseg32, u.shm_cpid); CP(tshmseg, tshmseg32, u.shm_nattch); CP(tshmseg, tshmseg32, u.shm_atime); CP(tshmseg, tshmseg32, u.shm_dtime); CP(tshmseg, tshmseg32, u.shm_ctime); /* Don't copy object, label, or cred */ outaddr = &tshmseg32; outsize = sizeof(tshmseg32); } else #endif { tshmseg.object = NULL; tshmseg.label = NULL; tshmseg.cred = NULL; outaddr = &tshmseg; outsize = sizeof(tshmseg); } error = SYSCTL_OUT(req, outaddr, outsize); if (error != 0) break; } SYSVSHM_UNLOCK(); return (error); } static int shm_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvshm is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int shm_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvshm controls which jail is the root of the associated segments * (this jail or same as the parent), or if the feature is available * at all. */ if (vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != NULL) osd_jail_del(pr, shm_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) shm_prison_cleanup(pr); /* Disable all child jails as well. */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, shm_prison_slot); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(shm_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) shm_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, shm_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int shm_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvshm based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int shm_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; SYSVSHM_LOCK(); prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); if (rpr == pr) shm_prison_cleanup(pr); SYSVSHM_UNLOCK(); return (0); } static void shm_prison_cleanup(struct prison *pr) { struct shmid_kernel *shmseg; int i; /* Remove any segments that belong to this jail. */ for (i = 0; i < shmalloced; i++) { shmseg = &shmsegs[i]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) && shmseg->cred != NULL && shmseg->cred->cr_prison == pr) { shm_remove(shmseg, i); } } } SYSCTL_JAIL_PARAM_SYS_NODE(sysvshm, CTLFLAG_RW, "SYSV shared memory"); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm_old shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ u_short shm_cpid; /* pid, creator */ u_short shm_lpid; /* pid, last operation */ short shm_nattch; /* no. of current attaches */ time_t shm_atime; /* last attach time */ time_t shm_dtime; /* last detach time */ time_t shm_ctime; /* last change time */ void *shm_handle; /* internal handle for shm segment */ }; struct oshmctl_args { int shmid; int cmd; struct oshmid_ds *ubuf; }; static int oshmctl(struct thread *td, struct oshmctl_args *uap) { #ifdef COMPAT_43 int error = 0; struct prison *rpr; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); if (uap->cmd != IPC_STAT) { return (freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap)); } SYSVSHM_LOCK(); shmseg = shm_find_segment(rpr, uap->shmid, true); if (shmseg == NULL) { SYSVSHM_UNLOCK(); return (EINVAL); } error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #endif ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm); outbuf.shm_segsz = shmseg->u.shm_segsz; outbuf.shm_cpid = shmseg->u.shm_cpid; outbuf.shm_lpid = shmseg->u.shm_lpid; outbuf.shm_nattch = shmseg->u.shm_nattch; outbuf.shm_atime = shmseg->u.shm_atime; outbuf.shm_dtime = shmseg->u.shm_dtime; outbuf.shm_ctime = shmseg->u.shm_ctime; outbuf.shm_handle = shmseg->object; SYSVSHM_UNLOCK(); return (copyout(&outbuf, uap->ubuf, sizeof(outbuf))); #else return (EINVAL); #endif } /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl, (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget, (sy_call_t *)freebsd7_shmctl }; #ifndef _SYS_SYSPROTO_H_ /* XXX actually varargs. */ struct shmsys_args { int which; int a2; int a3; int a4; }; #endif int sys_shmsys(struct thread *td, struct shmsys_args *uap) { AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(shmcalls)) return (EINVAL); return ((*shmcalls[uap->which])(td, &uap->a2)); } #endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */ #ifdef COMPAT_FREEBSD32 int freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: { /* shmat */ struct shmat_args ap; ap.shmid = uap->a2; ap.shmaddr = PTRIN(uap->a3); ap.shmflg = uap->a4; return (sysent[SYS_shmat].sy_call(td, &ap)); } case 2: { /* shmdt */ struct shmdt_args ap; ap.shmaddr = PTRIN(uap->a2); return (sysent[SYS_shmdt].sy_call(td, &ap)); } case 3: { /* shmget */ struct shmget_args ap; ap.key = uap->a2; ap.size = uap->a3; ap.shmflg = uap->a4; return (sysent[SYS_shmget].sy_call(td, &ap)); } case 4: { /* shmctl */ struct freebsd7_freebsd32_shmctl_args ap; ap.shmid = uap->a2; ap.cmd = uap->a3; ap.buf = PTRIN(uap->a4); return (freebsd7_freebsd32_shmctl(td, &ap)); } case 1: /* oshmctl */ default: return (EINVAL); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_shmctl(struct thread *td, struct freebsd7_freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32_old shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: memset(&u32.shmid_ds32, 0, sizeof(u32.shmid_ds32)); freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if (u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); u32.shmid_ds32.shm_internal = 0; error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif int freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32 shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: freebsd32_ipcperm_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if (u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_shmctl_args { int shmid; int cmd; struct shmid_ds_old *buf; }; #endif int freebsd7_shmctl(struct thread *td, struct freebsd7_shmctl_args *uap) { int error; struct shmid_ds_old old; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not to support this. */ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &old, sizeof(old)))) goto done; ipcperm_old2new(&old.shm_perm, &buf.shm_perm); CP(old, buf, shm_segsz); CP(old, buf, shm_lpid); CP(old, buf, shm_cpid); CP(old, buf, shm_nattch); CP(old, buf, shm_atime); CP(old, buf, shm_dtime); CP(old, buf, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: memset(&old, 0, sizeof(old)); ipcperm_new2old(&buf.shm_perm, &old.shm_perm); if (buf.shm_segsz > INT_MAX) old.shm_segsz = INT_MAX; else CP(buf, old, shm_segsz); CP(buf, old, shm_lpid); CP(buf, old, shm_cpid); if (buf.shm_nattch > SHRT_MAX) old.shm_nattch = SHRT_MAX; else CP(buf, old, shm_nattch); CP(buf, old, shm_atime); CP(buf, old, shm_dtime); CP(buf, old, shm_ctime); old.shm_internal = NULL; error = copyout(&old, uap->buf, sizeof(old)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ static int sysvshm_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = shminit(); if (error != 0) shmunload(); break; case MOD_UNLOAD: error = shmunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvshm_mod = { "sysvshm", &sysvshm_modload, NULL }; DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST); MODULE_VERSION(sysvshm, 1); Index: head/sys/kern/uipc_shm.c =================================================================== --- head/sys/kern/uipc_shm.c (revision 354868) +++ head/sys/kern/uipc_shm.c (revision 354869) @@ -1,1512 +1,1507 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2), shm_rename(2), and shm_unlink(2). * While most of the implementation is here, vm_mmap.c contains * mapping logic changes. * * posixshmcontrol(1) allows users to inspect the state of the memory * objects. Per-uid swap resource limit controls total amount of * memory that user can consume for anonymous objects, including * shared. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr64 shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_ioctl_t shm_ioctl; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; static fo_get_seals_t shm_get_seals; static fo_add_seals_t shm_add_seals; /* File descriptor operations. */ struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = shm_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ rv = vm_page_grab_valid(&m, obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY); if (rv != VM_PAGER_OK) { VM_OBJECT_WUNLOCK(obj); printf("uiomove_object: vm_obj %p idx %jd pager error %d\n", obj, idx, rv); return (EIO); } VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_unwire(m, PQ_ACTIVE); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) error = EPERM; else error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { switch (com) { case FIONBIO: case FIOASYNC: /* * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, * just like it would on an unlinked regular file */ return (0); default: return (ENOTTY); } } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = howmany(sb->st_size, sb->st_blksize); mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) { int error; char *path; const char *pr_path; size_t pr_pathlen; path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath_in, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error != 0) goto out; #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (path[pr_pathlen] != '/') { error = EINVAL; goto out; } *path_out = path; out: if (error != 0) free(path, M_SHMFD); return (error); } static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_ASSERT_WLOCKED(object); rangelock_cookie_assert(rl_cookie, RA_WLOCKED); if (length == shmfd->shm_size) return (0); nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) return (EPERM); /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) return (EBUSY); /* * Zero the truncated part of the last page. */ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); if (m != NULL) { MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, * and therefore not recently * accessed, immediately enqueue it * for asynchronous laundering. The * current operation is not regarded * as an access. */ vm_page_launder(m); } else { vm_page_free(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(vm_page_all_valid(m), ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); } } delta = IDX_TO_OFF(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { if ((shmfd->shm_seals & F_SEAL_GROW) != 0) return (EPERM); /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) return (ENOMEM); object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; return (0); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { void *rl_cookie; int error; rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); VM_OBJECT_WLOCK(shmfd->shm_object); error = shm_dotruncate_locked(shmfd, length, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); - shmfd->shm_object->pg_color = 0; - VM_OBJECT_WLOCK(shmfd->shm_object); - vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); - vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); - VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; shmfd->shm_ino = alloc_unr64(&shm_ino_unr); refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. */ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps, int initial_seals) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; void *rl_cookie; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. * If the decision is made later to allow additional seals, care must be * taken below to ensure that the seals are properly set if the shmfd * already existed -- this currently assumes that only F_SEAL_SEAL can * be set and doesn't take further precautions to ensure the validity of * the seals being added with respect to current mappings. */ if ((initial_seals & ~F_SEAL_SEAL) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; /* * shm_open(2) created shm should always have O_CLOEXEC set, as mandated * by POSIX. We allow it to be unset here so that an in-kernel * interface may be written as a thin layer around shm, optionally not * setting CLOEXEC. For shm_open(2), O_CLOEXEC is set unconditionally * in sys_shm_open() to keep this implementation compliant. */ error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); shmfd->shm_seals = initial_seals; } else { error = shm_copyin_path(td, userpath, &path); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shmfd->shm_seals = initial_seals; shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); /* * kern_shm_open() likely shouldn't ever error out on * trying to set a seal that already exists, unlike * F_ADD_SEALS. This would break terribly as * shm_open(2) actually sets F_SEAL_SEAL to maintain * historical behavior where the underlying file could * not be sealed. */ initial_seals &= ~shmfd->shm_seals; /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); /* * initial_seals can't set additional seals if we've * already been set F_SEAL_SEAL. If F_SEAL_SEAL is set, * then we've already removed that one from * initial_seals. This is currently redundant as we * only allow setting F_SEAL_SEAL at creation time, but * it's cheap to check and decreases the effort required * to allow additional seals. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 && initial_seals != 0) error = EPERM; else if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { VM_OBJECT_WLOCK(shmfd->shm_object); #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif error = shm_dotruncate_locked(shmfd, 0, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); } if (error == 0) { /* * Currently we only allow F_SEAL_SEAL to be * set initially. As noted above, this would * need to be reworked should that change. */ shmfd->shm_seals |= initial_seals; shm_hold(shmfd); } rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ #ifdef COMPAT_FREEBSD12 int freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode, NULL, F_SEAL_SEAL)); } #endif int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; Fnv32_t fnv; int error; error = shm_copyin_path(td, uap->path, &path); if (error != 0) return (error); AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int sys_shm_rename(struct thread *td, struct shm_rename_args *uap) { char *path_from = NULL, *path_to = NULL; Fnv32_t fnv_from, fnv_to; struct shmfd *fd_from; struct shmfd *fd_to; int error; int flags; flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Make sure the user passed only valid flags. * If you add a new flag, please add a new term here. */ if ((flags & ~( SHM_RENAME_NOREPLACE | SHM_RENAME_EXCHANGE )) != 0) { error = EINVAL; goto out; } /* * EXCHANGE and NOREPLACE don't quite make sense together. Let's * force the user to choose one or the other. */ if ((flags & SHM_RENAME_NOREPLACE) != 0 && (flags & SHM_RENAME_EXCHANGE) != 0) { error = EINVAL; goto out; } /* Renaming to or from anonymous makes no sense */ if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) { error = EINVAL; goto out; } error = shm_copyin_path(td, uap->path_from, &path_from); if (error != 0) goto out; error = shm_copyin_path(td, uap->path_to, &path_to); if (error != 0) goto out; AUDIT_ARG_UPATH1_CANON(path_from); AUDIT_ARG_UPATH2_CANON(path_to); /* Rename with from/to equal is a no-op */ if (strcmp(path_from, path_to) == 0) goto out; fnv_from = fnv_32_str(path_from, FNV1_32_INIT); fnv_to = fnv_32_str(path_to, FNV1_32_INIT); sx_xlock(&shm_dict_lock); fd_from = shm_lookup(path_from, fnv_from); if (fd_from == NULL) { error = ENOENT; goto out_locked; } fd_to = shm_lookup(path_to, fnv_to); if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) { error = EEXIST; goto out_locked; } /* * Unconditionally prevents shm_remove from invalidating the 'from' * shm's state. */ shm_hold(fd_from); error = shm_remove(path_from, fnv_from, td->td_ucred); /* * One of my assumptions failed if ENOENT (e.g. locking didn't * protect us) */ KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s", path_from)); if (error != 0) { shm_drop(fd_from); goto out_locked; } /* * If we are exchanging, we need to ensure the shm_remove below * doesn't invalidate the dest shm's state. */ if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) shm_hold(fd_to); /* * NOTE: if path_to is not already in the hash, c'est la vie; * it simply means we have nothing already at path_to to unlink. * That is the ENOENT case. * * If we somehow don't have access to unlink this guy, but * did for the shm at path_from, then relink the shm to path_from * and abort with EACCES. * * All other errors: that is weird; let's relink and abort the * operation. */ error = shm_remove(path_to, fnv_to, td->td_ucred); if (error != 0 && error != ENOENT) { shm_insert(path_from, fnv_from, fd_from); shm_drop(fd_from); /* Don't free path_from now, since the hash references it */ path_from = NULL; goto out_locked; } error = 0; shm_insert(path_to, fnv_to, fd_from); /* Don't free path_to now, since the hash references it */ path_to = NULL; /* We kept a ref when we removed, and incremented again in insert */ shm_drop(fd_from); KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_from->shm_refs)); if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) { shm_insert(path_from, fnv_from, fd_to); path_from = NULL; shm_drop(fd_to); KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_to->shm_refs)); } out_locked: sx_xunlock(&shm_dict_lock); out: free(path_from, M_SHMFD); free(path_to, M_SHMFD); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; bool writecnt; void *rl_cookie; shmfd = fp->f_data; maxprot = VM_PROT_NONE; rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize, &shmfd->shm_mtx); /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; writecnt = (flags & MAP_SHARED) != 0 && (prot & VM_PROT_WRITE) != 0; if (writecnt && (shmfd->shm_seals & F_SEAL_WRITE) != 0) { error = EPERM; goto out; } /* Don't permit shared writable mappings on read-only descriptors. */ if (writecnt && (maxprot & VM_PROT_WRITE) == 0) { error = EACCES; goto out; } maxprot &= cap_maxprot; /* See comment in vn_mmap(). */ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - objsize) { error = EINVAL; goto out; } #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) goto out; #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); if (writecnt) vm_pager_update_writecount(shmfd->shm_object, 0, objsize); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, writecnt, td); if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, objsize); vm_object_deallocate(shmfd->shm_object); } out: rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. */ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) { const char *path, *pr_path; size_t pr_pathlen; bool visible; sx_assert(&shm_dict_lock, SA_LOCKED); kif->kf_type = KF_TYPE_SHM; kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); visible = strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/'; if (list && !visible) return (EPERM); if (visible) path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } } return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp __unused) { int res; sx_slock(&shm_dict_lock); res = shm_fill_kinfo_locked(fp->f_data, kif, false); sx_sunlock(&shm_dict_lock); return (res); } static int shm_add_seals(struct file *fp, int seals) { struct shmfd *shmfd; void *rl_cookie; vm_ooffset_t writemappings; int error, nseals; error = 0; shmfd = fp->f_data; rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); /* Even already-set seals should result in EPERM. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) { error = EPERM; goto out; } nseals = seals & ~shmfd->shm_seals; if ((nseals & F_SEAL_WRITE) != 0) { /* * The rangelock above prevents writable mappings from being * added after we've started applying seals. The RLOCK here * is to avoid torn reads on ILP32 arches as unmapping/reducing * writemappings will be done without a rangelock. */ VM_OBJECT_RLOCK(shmfd->shm_object); writemappings = shmfd->shm_object->un_pager.swp.writemappings; VM_OBJECT_RUNLOCK(shmfd->shm_object); /* kmappings are also writable */ if (writemappings > 0) { error = EBUSY; goto out; } } shmfd->shm_seals |= nseals; out: rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } static int shm_get_seals(struct file *fp, int *seals) { struct shmfd *shmfd; shmfd = fp->f_data; *seals = shmfd->shm_seals; return (0); } static int sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) { struct shm_mapping *shmm; struct sbuf sb; struct kinfo_file kif; u_long i; ssize_t curlen; int error, error2; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); curlen = 0; error = 0; sx_slock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { error = shm_fill_kinfo_locked(shmm->sm_shmfd, &kif, true); if (error == EPERM) continue; if (error != 0) break; pack_kinfo(&kif); if (req->oldptr != NULL && kif.kf_structsize + curlen > req->oldlen) break; error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 0 : ENOMEM; if (error != 0) break; curlen += kif.kf_structsize; } } sx_sunlock(&shm_dict_lock); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, NULL, 0, sysctl_posix_shm_list, "", "POSIX SHM list"); int kern_shm_open2(struct thread *td, const char *path, int flags, mode_t mode, int shmflags, const char *name __unused) { int initial_seals; if ((shmflags & ~SHM_ALLOW_SEALING) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; if ((shmflags & SHM_ALLOW_SEALING) != 0) initial_seals &= ~F_SEAL_SEAL; return (kern_shm_open(td, path, flags, mode, NULL, initial_seals)); } /* * This version of the shm_open() interface leaves CLOEXEC behavior up to the * caller, and libc will enforce it for the traditional shm_open() call. This * allows other consumers, like memfd_create(), to opt-in for CLOEXEC. This * interface also includes a 'name' argument that is currently unused, but could * potentially be exported later via some interface for debugging purposes. * From the kernel's perspective, it is optional. Individual consumers like * memfd_create() may require it in order to be compatible with other systems * implementing the same function. */ int sys_shm_open2(struct thread *td, struct shm_open2_args *uap) { return (kern_shm_open2(td, uap->path, uap->flags, uap->mode, uap->shmflags, uap->name)); } Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 354868) +++ head/sys/vm/swap_pager.c (revision 354869) @@ -1,3057 +1,3057 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64. * The 64-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 32 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif #define SWAP_META_PAGES PCTRIE_COUNT /* * A swblk structure maps each page index within a * SWAP_META_PAGES-aligned and sized range to the address of an * on-disk swap block (or SWAPBLK_NONE). The collection of these * mappings for an entire vm object is implemented as a pc-trie. */ struct swblk { vm_pindex_t p; daddr_t d[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static u_long swap_reserved; static u_long swap_total; static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_reserved, 0, sysctl_page_shift, "A", "Amount of swap storage needed to back all allocated anonymous memory."); SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_total, 0, sysctl_page_shift, "A", "Total amount of available swap storage."); static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) static int sysctl_page_shift(SYSCTL_HANDLER_ARGS) { uint64_t newval; u_long value = *(u_long *)arg1; newval = ((uint64_t)value) << PAGE_SHIFT; return (sysctl_handle_64(oidp, &newval, 0, req)); } int swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { u_long r, s, prev, pincr; int res, error; static int curfail; static struct timeval lastfail; struct uidinfo *uip; uip = cred->cr_ruidinfo; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); error = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); if (error != 0) return (0); } #endif pincr = atop(incr); res = 0; prev = atomic_fetchadd_long(&swap_reserved, pincr); r = prev + pincr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_wire_count(); } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; } else { prev = atomic_fetchadd_long(&swap_reserved, -pincr); if (prev < pincr) panic("swap_reserved < incr on overcommit fail"); } if (res) { prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) { res = 0; prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); if (prev < pincr) panic("uip->ui_vmsize < incr on overcommit fail"); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", uip->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (racct_enable && !res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (res); } void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; u_long pincr; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); PROC_LOCK(curproc); #ifdef RACCT if (racct_enable) racct_add_force(curproc, RACCT_SWAP, incr); #endif pincr = atop(incr); atomic_add_long(&swap_reserved, pincr); uip = curproc->p_ucred->cr_ruidinfo; atomic_add_long(&uip->ui_vmsize, pincr); PROC_UNLOCK(curproc); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curproc->p_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { u_long prev, pdecr; struct uidinfo *uip; uip = cred->cr_ruidinfo; KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__, (uintmax_t)decr)); pdecr = atop(decr); prev = atomic_fetchadd_long(&swap_reserved, -pdecr); if (prev < pdecr) panic("swap_reserved < decr"); prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); if (prev < pdecr) printf("negative vmsize for uid = %d\n", uip->ui_uid); #ifdef RACCT if (racct_enable) racct_sub_cred(cred, RACCT_SWAP, decr); #endif } static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A", "Swap Fragmentation Info"); static struct sx sw_alloc_sx; /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swwbuf_zone; static uma_zone_t swrbuf_zone; static uma_zone_t swblk_zone; static uma_zone_t swpctrie_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ .pgo_update_writecount = swap_pager_update_writecount, .pgo_release_writecount = swap_pager_release_writecount, }; /* * swap_*() routines are externally accessible. swp_*() routines are * internal. */ static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0, "Maximum size of a swap block in pages"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit); static void swp_pager_free_empty_swblk(vm_object_t, struct swblk *sb); static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages); static daddr_t swp_pager_getswapspace(int *npages, int limit); /* * Metadata functions */ static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst, vm_pindex_t pindex, vm_pindex_t count); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_lookup(vm_object_t, vm_pindex_t); static void swp_pager_init_freerange(daddr_t *start, daddr_t *num) { *start = SWAPBLK_NONE; *num = 0; } static void swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr) { if (*start + *num == addr) { (*num)++; } else { swp_pager_freeswapspace(*start, *num); *start = addr; *num = 1; } } static void * swblk_trie_alloc(struct pctrie *ptree) { return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0))); } static void swblk_trie_free(struct pctrie *ptree, void *node) { uma_zfree(swpctrie_zone, node); } PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); sx_init(&swdev_syscall_lock, "swsysc"); } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array, which has MAXPHYS / PAGE_SIZE entries, and our locally * defined MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER); nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF); swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4); swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2); /* * Initialize our zone, taking the user's requested size or * estimating the number we need based on the number of pages * in the system. */ n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) : vm_cnt.v_page_count / 2; swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); if (swpctrie_zone == NULL) panic("failed to create swap pctrie zone."); swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, NULL, NULL, _Alignof(struct swblk) - 1, UMA_ZONE_VM); if (swblk_zone == NULL) panic("failed to create swap blk zone."); n2 = n; do { if (uma_zone_reserve_kva(swblk_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); /* * Often uma_zone_reserve_kva() cannot reserve exactly the * requested size. Account for the difference when * calculating swap_maxpages. */ n = uma_zone_get_max(swblk_zone); if (n < n2) printf("Swap blk zone entries changed from %lu to %lu.\n", n2, n); /* absolute maximum we can handle assuming 100% efficiency */ swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblk); if (!uma_zone_reserve_kva(swpctrie_zone, n)) printf("Cannot reserve swap pctrie zone, " "reduce kern.maxswzone.\n"); } static vm_object_t swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset) { vm_object_t object; if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } /* * The un_pager.swp.swp_blks trie is initialized by * vm_object_allocate() to ensure the correct order of * visibility to other threads. */ object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset + PAGE_MASK + size)); object->un_pager.swp.writemappings = 0; object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } return (object); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. * * This routine must ensure that no live duplicate is created for * the named object request, which is protected against by * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) { /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = swap_pager_alloc_init(handle, cred, size, offset); if (object != NULL) { TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object, pager_object_list); } } sx_xunlock(&sw_alloc_sx); } else { object = swap_pager_alloc_init(handle, cred, size, offset); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { VM_OBJECT_WUNLOCK(object); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(object); } vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for up to the requested number of pages, and at * least a minimum number of pages. The starting swap block number * (a page index) is returned or SWAPBLK_NONE if the allocation * failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. */ static daddr_t swp_pager_getswapspace(int *io_npages, int limit) { daddr_t blk; struct swdevt *sp; int mpages, npages; blk = SWAPBLK_NONE; mpages = *io_npages; npages = imin(BLIST_MAX_ALLOC, mpages); mtx_lock(&sw_dev_mtx); sp = swdevhd; while (!TAILQ_EMPTY(&swtailq)) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if ((sp->sw_flags & SW_CLOSING) == 0) blk = blist_alloc(sp->sw_blist, &npages, mpages); if (blk != SWAPBLK_NONE) break; sp = TAILQ_NEXT(sp, sw_list); if (swdevhd == sp) { if (npages <= limit) break; mpages = npages - 1; npages >>= 1; } } if (blk != SWAPBLK_NONE) { *io_npages = npages; blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); } else { if (swap_pager_full != 2) { printf("swp_pager_getswapspace(%d): failed\n", *io_npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; } mtx_unlock(&sw_dev_mtx); return (blk); } static bool swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(bp->b_blkno, sp)) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages) { struct swdevt *sp; if (npages == 0) return; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(blk, sp)) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats */ static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct swdevt *sp; const char *devname; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname); blist_stats(sp->sw_blist, &sbuf); } mtx_unlock(&sw_dev_mtx); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { daddr_t addr, blk, n_free, s_free; int i, j, n; swp_pager_init_freerange(&s_free, &n_free); VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += n) { n = size - i; blk = swp_pager_getswapspace(&n, 1); if (blk == SWAPBLK_NONE) { swp_pager_meta_free(object, start, i); VM_OBJECT_WUNLOCK(object); return (-1); } for (j = 0; j < n; ++j) { addr = swp_pager_meta_build(object, start + i + j, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); } } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WUNLOCK(object); return (0); } static bool swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t pindex, daddr_t addr) { daddr_t dstaddr; KASSERT(srcobject->type == OBJT_SWAP, ("%s: Srcobject not swappable", __func__)); if (dstobject->type == OBJT_SWAP && swp_pager_meta_lookup(dstobject, pindex) != SWAPBLK_NONE) { /* Caller should destroy the source block. */ return (false); } /* * Destination has no swapblk and is not resident, transfer source. * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); dstaddr = swp_pager_meta_build(dstobject, pindex, addr); KASSERT(dstaddr == SWAPBLK_NONE, ("Unexpected destination swapblk")); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); return (true); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource && srcobject->handle != NULL) { vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); VM_OBJECT_WUNLOCK(dstobject); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(dstobject); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } /* * Transfer source to destination. */ swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size); /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk, blk0; int i; VM_OBJECT_ASSERT_LOCKED(object); KASSERT(object->type == OBJT_SWAP, ("%s: object not swappable", __func__)); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_lookup(object, pindex); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_lookup(object, pindex - i); if (blk != blk0 - i) break; } *before = i - 1; } /* * find forward-looking contiguous good backing store */ if (after != NULL) { for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_lookup(object, pindex + i); if (blk != blk0 + i) break; } *after = i - 1; } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page must be locked. */ static void swap_pager_unswapped(vm_page_t m) { struct swblk *sb; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ KASSERT(m->object->type == OBJT_SWAP, ("Free object not swappable")); sb = SWAP_PCTRIE_LOOKUP(&m->object->un_pager.swp.swp_blks, rounddown(m->pindex, SWAP_META_PAGES)); if (sb == NULL) return; if (sb->d[m->pindex % SWAP_META_PAGES] == SWAPBLK_NONE) return; swp_pager_freeswapspace(sb->d[m->pindex % SWAP_META_PAGES], 1); sb->d[m->pindex % SWAP_META_PAGES] = SWAPBLK_NONE; swp_pager_free_empty_swblk(m->object, sb); } /* * swap_pager_getpages() - bring pages in from swap * * Attempt to page in the pages in array "ma" of length "count". The * caller may optionally specify that additional pages preceding and * succeeding the specified range be paged in. The number of such pages * is returned in the "rbehind" and "rahead" parameters, and they will * be in the inactive queue upon return. * * The pages in "ma" must be busied and will remain busied upon return. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { struct buf *bp; vm_page_t bm, mpred, msucc, p; vm_pindex_t pindex; daddr_t blk; int i, maxahead, maxbehind, reqcount; reqcount = count; /* * Determine the final number of read-behind pages and * allocate them BEFORE releasing the object lock. Otherwise, * there can be a problematic race with vm_object_split(). * Specifically, vm_object_split() might first transfer pages * that precede ma[0] in the current object to a new object, * and then this function incorrectly recreates those pages as * read-behind pages in the current object. */ KASSERT(object->type == OBJT_SWAP, ("%s: object not swappable", __func__)); if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) return (VM_PAGER_FAIL); /* * Clip the readahead and readbehind ranges to exclude resident pages. */ if (rahead != NULL) { KASSERT(reqcount - 1 <= maxahead, ("page count %d extends beyond swap block", reqcount)); *rahead = imin(*rahead, maxahead - (reqcount - 1)); pindex = ma[reqcount - 1]->pindex; msucc = TAILQ_NEXT(ma[reqcount - 1], listq); if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) *rahead = msucc->pindex - pindex - 1; } if (rbehind != NULL) { *rbehind = imin(*rbehind, maxbehind); pindex = ma[0]->pindex; mpred = TAILQ_PREV(ma[0], pglist, listq); if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) *rbehind = pindex - mpred->pindex - 1; } bm = ma[0]; for (i = 0; i < count; i++) ma[i]->oflags |= VPO_SWAPINPROG; /* * Allocate readahead and readbehind pages. */ if (rbehind != NULL) { for (i = 1; i <= *rbehind; i++) { p = vm_page_alloc(object, ma[0]->pindex - i, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; bm = p; } *rbehind = i - 1; } if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; } *rahead = i; } if (rbehind != NULL) count += *rbehind; if (rahead != NULL) count += *rahead; vm_object_pip_add(object, count); pindex = bm->pindex; blk = swp_pager_meta_lookup(object, pindex); KASSERT(blk != SWAPBLK_NONE, ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); bp = uma_zalloc(swrbuf_zone, M_WAITOK); /* Pages cannot leave the object while busy. */ for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) { MPASS(p->pindex == bm->pindex + i); bp->b_pages[i] = p; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; bp->b_npages = count; bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; bp->b_pgafter = rahead != NULL ? *rahead : 0; VM_CNT_INC(v_swapin); VM_CNT_ADD(v_swappgsin, count); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our ma[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) { ma[0]->oflags |= VPO_SWAPSLEEP; VM_CNT_INC(v_intrans); if (VM_OBJECT_SLEEP(object, &object->handle, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * If we had an unrecoverable read error pages will not be valid. */ for (i = 0; i < reqcount; i++) if (ma[i]->valid != VM_PAGE_BITS_ALL) return (VM_PAGER_ERROR); return (VM_PAGER_OK); /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_getpages_async(): * * Right now this is emulation of asynchronous operation on top of * swap_pager_getpages(). */ static int swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { int r, error; r = swap_pager_getpages(object, ma, count, rbehind, rahead); VM_OBJECT_WUNLOCK(object); switch (r) { case VM_PAGER_OK: error = 0; break; case VM_PAGER_ERROR: error = EIO; break; case VM_PAGER_FAIL: error = EINVAL; break; default: panic("unhandled swap_pager_getpages() error %d", r); } (iodone)(arg, ma, count, error); VM_OBJECT_WLOCK(object); return (r); } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whose rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ static void swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, int flags, int *rtvals) { struct buf *bp; daddr_t addr, blk, n_free, s_free; vm_page_t mreq; int i, j, n; bool async; KASSERT(count == 0 || ma[0]->object == object, ("%s: object mismatch %p/%p", __func__, object, ma[0]->object)); /* * Step 1 * * Turn object into OBJT_SWAP. Force sync if not a pageout process. */ if (object->type != OBJT_SWAP) { addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE); KASSERT(addr == SWAPBLK_NONE, ("unexpected object swap block")); } VM_OBJECT_WUNLOCK(object); async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0; swp_pager_init_freerange(&s_free, &n_free); /* * Step 2 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { /* Maximum I/O size is limited by maximum swap block size. */ n = min(count - i, nsw_cluster_max); /* Get a block of swap of size up to size n. */ blk = swp_pager_getswapspace(&n, 4); if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i + j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied. Build the I/O * request and assign the swap space. */ if (async) { mtx_lock(&swbuf_mtx); while (nsw_wcount_async == 0) msleep(&nsw_wcount_async, &swbuf_mtx, PVM, "swbufa", 0); nsw_wcount_async--; mtx_unlock(&swbuf_mtx); } bp = uma_zalloc(swwbuf_zone, M_WAITOK); if (async) bp->b_flags = B_ASYNC; bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_WLOCK(object); for (j = 0; j < n; ++j) { mreq = ma[i + j]; addr = swp_pager_meta_build(mreq->object, mreq->pindex, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); MPASS(mreq->dirty == VM_PAGE_BITS_ALL); mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_WUNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; VM_CNT_INC(v_swapout); VM_CNT_ADD(v_swappgsout, bp->b_npages); /* * We unconditionally set rtvals[] to VM_PAGER_PEND so that we * can call the async completion routine at the end of a * synchronous I/O operation. Otherwise, our caller would * perform duplicate unbusy and wakeup operations on the page * and object, respectively. */ for (j = 0; j < n; j++) rtvals[i + j] = VM_PAGER_PEND; /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ if (async) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * Report error - unless we ran out of memory, in which case * we've already logged it in swapgeom_strategy(). */ if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); else bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->handle); } if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ vm_page_invalid(m); } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ MPASS(m->dirty == VM_PAGE_BITS_ALL); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); vm_page_valid(m); if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). * A page is only written to swap after a period of * inactivity. Therefore, we do not expect it to be * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); vm_page_lock(m); vm_page_deactivate_noreuse(m); vm_page_unlock(m); vm_page_sunbusy(m); } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ if (bp->b_flags & B_ASYNC) { mtx_lock(&swbuf_mtx); if (++nsw_wcount_async == 1) wakeup(&nsw_wcount_async); mtx_unlock(&swbuf_mtx); } uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp); } int swap_pager_nswapdev(void) { return (nswapdev); } static void swp_pager_force_dirty(vm_page_t m) { vm_page_dirty(m); #ifdef INVARIANTS vm_page_lock(m); if (!vm_page_wired(m) && m->queue == PQ_NONE) panic("page %p is neither wired nor queued", m); vm_page_unlock(m); #endif vm_page_xunbusy(m); swap_pager_unswapped(m); } static void swp_pager_force_launder(vm_page_t m) { vm_page_dirty(m); vm_page_lock(m); vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); swap_pager_unswapped(m); } /* * SWP_PAGER_FORCE_PAGEIN() - force swap blocks to be paged in * * This routine dissociates pages starting at the given index within an * object from their backing store, paging them in if they do not reside * in memory. Pages that are paged in are marked dirty and placed in the * laundry queue. Pages are marked dirty because they no longer have * backing store. They are placed in the laundry queue because they have * not been accessed recently. Otherwise, they would already reside in * memory. */ static void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex, int npages) { vm_page_t ma[npages]; int i, j; KASSERT(npages > 0, ("%s: No pages", __func__)); KASSERT(npages <= MAXPHYS / PAGE_SIZE, ("%s: Too many pages: %d", __func__, npages)); KASSERT(object->type == OBJT_SWAP, ("%s: Object not swappable", __func__)); vm_object_pip_add(object, npages); vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL, ma, npages); for (i = j = 0;; i++) { /* Count nonresident pages, to page-in all at once. */ if (i < npages && ma[i]->valid != VM_PAGE_BITS_ALL) continue; if (j < i) { /* Page-in nonresident pages. Mark for laundering. */ if (swap_pager_getpages(object, &ma[j], i - j, NULL, NULL) != VM_PAGER_OK) panic("%s: read from swap failed", __func__); do { swp_pager_force_launder(ma[j]); } while (++j < i); } if (i == npages) break; /* Mark dirty a resident page. */ swp_pager_force_dirty(ma[j++]); } vm_object_pip_wakeupn(object, npages); } /* * swap_pager_swapoff_object: * * Page in all of the pages that have been paged out for an object * to a swap device. */ static void swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object) { struct swblk *sb; vm_pindex_t pi, s_pindex; daddr_t blk, n_blks, s_blk; int i; KASSERT(object->type == OBJT_SWAP, ("%s: Object not swappable", __func__)); n_blks = 0; for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi)) != NULL; ) { for (i = 0; i < SWAP_META_PAGES; i++) { blk = sb->d[i]; if (!swp_pager_isondev(blk, sp)) blk = SWAPBLK_NONE; /* * If there are no blocks/pages accumulated, start a new * accumulation here. */ if (n_blks == 0) { if (blk != SWAPBLK_NONE) { s_blk = blk; s_pindex = sb->p + i; n_blks = 1; } continue; } /* * If the accumulation can be extended without breaking * the sequence of consecutive blocks and pages that * swp_pager_force_pagein() depends on, do so. */ if (n_blks < MAXPHYS / PAGE_SIZE && s_blk + n_blks == blk && s_pindex + n_blks == sb->p + i) { ++n_blks; continue; } /* * The sequence of consecutive blocks and pages cannot * be extended, so page them all in here. Then, * because doing so involves releasing and reacquiring * a lock that protects the swap block pctrie, do not * rely on the current swap block. Break this loop and * re-fetch the same pindex from the pctrie again. */ swp_pager_force_pagein(object, s_pindex, n_blks); n_blks = 0; break; } if (i == SWAP_META_PAGES) pi = sb->p + SWAP_META_PAGES; } if (n_blks > 0) swp_pager_force_pagein(object, s_pindex, n_blks); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { vm_object_t object; int retries; sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; full_rescan: mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->type != OBJT_SWAP) continue; mtx_unlock(&vm_object_list_mtx); /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); /* * Dead objects are eventually terminated on their own. */ if ((object->flags & OBJ_DEAD) != 0) goto next_obj; /* * Sync with fences placed after pctrie * initialization. We must not access pctrie below * unless we checked that our object is swap and not * dead. */ atomic_thread_fence_acq(); if (object->type != OBJT_SWAP) goto next_obj; swap_pager_swapoff_object(sp, object); next_obj: VM_OBJECT_WUNLOCK(object); mtx_lock(&vm_object_list_mtx); } mtx_unlock(&vm_object_list_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } EVENTHANDLER_INVOKE(swapoff, sp); } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free? */ static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit) { int i; MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES); for (i = start; i < limit; i++) { if (sb->d[i] != SWAPBLK_NONE) return (false); } return (true); } /* * SWP_PAGER_FREE_EMPTY_SWBLK() - frees if a block is free * * Nothing is done if the block is still in use. */ static void swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb) { if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is returned. */ static daddr_t swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted; struct swblk *sb, *sb1; vm_pindex_t modpi, rdpi; daddr_t prev_swapblk; int error, i; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff()'s iteration over * object_list does not see a garbage pctrie. */ atomic_thread_fence_rel(); object->type = OBJT_SWAP; object->un_pager.swp.writemappings = 0; KASSERT(object->handle == NULL, ("default pager with handle")); } rdpi = rounddown(pindex, SWAP_META_PAGES); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb == NULL) { if (swapblk == SWAPBLK_NONE) return (SWAPBLK_NONE); for (;;) { sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (sb != NULL) { sb->p = rdpi; for (i = 0; i < SWAP_META_PAGES; i++) sb->d[i] = SWAPBLK_NONE; if (atomic_cmpset_int(&swblk_zone_exhausted, 1, 0)) printf("swblk zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swblk_zone)) { if (atomic_cmpset_int(&swblk_zone_exhausted, 0, 1)) printf("swap blk zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxb", 10); } else uma_zwait(swblk_zone); VM_OBJECT_WLOCK(object); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb != NULL) /* * Somebody swapped out a nearby page, * allocating swblk at the rdpi index, * while we dropped the object lock. */ goto allocated; } for (;;) { error = SWAP_PCTRIE_INSERT( &object->un_pager.swp.swp_blks, sb); if (error == 0) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 1, 0)) printf("swpctrie zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swpctrie_zone)) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 0, 1)) printf("swap pctrie zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxp", 10); } else uma_zwait(swpctrie_zone); VM_OBJECT_WLOCK(object); sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb1 != NULL) { uma_zfree(swblk_zone, sb); sb = sb1; goto allocated; } } } allocated: MPASS(sb->p == rdpi); modpi = pindex % SWAP_META_PAGES; /* Return prior contents of metadata. */ prev_swapblk = sb->d[modpi]; /* Enter block into metadata. */ sb->d[modpi] = swapblk; /* * Free the swblk if we end up with the empty page run. */ if (swapblk == SWAPBLK_NONE) swp_pager_free_empty_swblk(object, sb); return (prev_swapblk); } /* * SWP_PAGER_META_TRANSFER() - free a range of blocks in the srcobject's swap * metadata, or transfer it into dstobject. * * This routine will free swap metadata structures as they are cleaned * out. */ static void swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t pindex, vm_pindex_t count) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t offset, last; int i, limit, start; VM_OBJECT_ASSERT_WLOCKED(srcobject); if (srcobject->type != OBJT_SWAP || count == 0) return; swp_pager_init_freerange(&s_free, &n_free); offset = pindex; last = pindex + count; for (;;) { sb = SWAP_PCTRIE_LOOKUP_GE(&srcobject->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL || sb->p >= last) break; start = pindex > sb->p ? pindex - sb->p : 0; limit = last - sb->p < SWAP_META_PAGES ? last - sb->p : SWAP_META_PAGES; for (i = start; i < limit; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; if (dstobject == NULL || !swp_pager_xfer_source(srcobject, dstobject, sb->p + i - offset, sb->d[i])) { swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } sb->d[i] = SWAPBLK_NONE; } pindex = sb->p + SWAP_META_PAGES; if (swp_pager_swblk_empty(sb, 0, start) && swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&srcobject->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count) { swp_pager_meta_transfer(object, NULL, pindex, count); } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. */ static void swp_pager_meta_free_all(vm_object_t object) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t pindex; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return; swp_pager_init_freerange(&s_free, &n_free); for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pindex)) != NULL;) { pindex = sb->p + SWAP_META_PAGES; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_METACTL() - misc control of swap meta data. * * This routine is capable of looking up, or removing swapblk * assignments in the swap meta data. It returns the swapblk being * looked-up, popped, or SWAPBLK_NONE if the block was invalid. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. */ static daddr_t swp_pager_meta_lookup(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ KASSERT(object->type == OBJT_SWAP, ("Lookup object not swappable")); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (SWAPBLK_NONE); return (sb->d[pindex % SWAP_META_PAGES]); } /* * Returns the least page index which is greater than or equal to the * parameter pindex and for which there is a swap block allocated. * Returns object's size if the object's type is not swap or if there * are no allocated swap blocks for the object after the requested * pindex. */ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; int i; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP) return (object->size); sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); if (sb->p < pindex) { for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, roundup(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); } for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } /* * We get here if a swblk is present in the trie but it * doesn't map any blocks. */ MPASS(0); return (object->size); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swblk_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: sx_xunlock(&swdev_syscall_lock); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message. */ static void swapon_check_swzone(void) { /* recommend using no more than half that amount */ if (swap_total > swap_maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", swap_total, swap_maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); } } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf( "WARNING: reducing swap size to maximum of %luMB per unit\n", mblocks / 1024 / 1024 * PAGE_SIZE); nblks = mblocks; } sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* * Do not free the first blocks in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, howmany(BBSIZE, PAGE_SIZE), nblks - howmany(BBSIZE, PAGE_SIZE)); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks - howmany(BBSIZE, PAGE_SIZE); swap_total += nblks; swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); EVENTHANDLER_INVOKE(swapon, sp); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: sx_xunlock(&swdev_syscall_lock); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks; #ifdef MAC int error; #endif sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); swap_total -= nblks; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); mtx_lock(&sw_dev_mtx); sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); sx_xunlock(&swdev_syscall_lock); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { if (vn_isdisk(sp->sw_vp, NULL)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } #if defined(COMPAT_FREEBSD11) #define XSWDEV_VERSION_11 1 struct xswdev11 { u_int xsw_version; uint32_t xsw_dev; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 { u_int xsw_version; u_int xsw_dev1, xsw_dev2; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 xs32; #endif #if defined(COMPAT_FREEBSD11) struct xswdev11 xs11; #endif int error; if (arg2 != 1) /* name length */ return (EINVAL); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_FREEBSD32) if (req->oldlen == sizeof(xs32)) { xs32.xsw_version = XSWDEV_VERSION; xs32.xsw_dev1 = xs.xsw_dev; xs32.xsw_dev2 = xs.xsw_dev >> 32; xs32.xsw_flags = xs.xsw_flags; xs32.xsw_nblks = xs.xsw_nblks; xs32.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs32, sizeof(xs32)); return (error); } #endif #if defined(COMPAT_FREEBSD11) if (req->oldlen == sizeof(xs11)) { xs11.xsw_version = XSWDEV_VERSION_11; xs11.xsw_dev = xs.xsw_dev; /* truncation */ xs11.xsw_flags = xs.xsw_flags; xs11.xsw_nblks = xs.xsw_nblks; xs11.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs11, sizeof(xs11)); return (error); } #endif error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_vm_swap_info, "Swap statistics by device"); /* * Count the approximate swap usage in pages for a vmspace. The * shadowed or not yet copied on write swap blocks are not accounted. * The map must be locked. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; struct swblk *sb; vm_pindex_t e, pi; long count; int i; map = &vmspace->vm_map; count = 0; VM_MAP_ENTRY_FOREACH(cur, map) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; object = cur->object.vm_object; if (object == NULL || object->type != OBJT_SWAP) continue; VM_OBJECT_RLOCK(object); if (object->type != OBJT_SWAP) goto unlock; pi = OFF_TO_IDX(cur->offset); e = pi + OFF_TO_IDX(cur->end - cur->start); for (;; pi = sb->p + SWAP_META_PAGES) { sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi); if (sb == NULL || sb->p >= e) break; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->p + i < e && sb->d[i] != SWAPBLK_NONE) count++; } } unlock: VM_OBJECT_RUNLOCK(object); } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } /* * Add a reference to the g_consumer for an inflight transaction. */ static void swapgeom_acquire(struct g_consumer *cp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index++; } /* * Remove a reference from the g_consumer. Post a close event if all * references go away, since the function might be called from the * biodone context. */ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index--; if (cp->index == 0) { if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) sp->sw_id = NULL; } } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bp->b_caller1 = NULL; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; printf("swap_pager: cannot allocate bio\n"); bufdone(bp); return; } bp->b_caller1 = bio; bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } /* * Drop reference we were created with. Do directly since we're in a * special context where we don't have to queue the call to * swapgeom_close_ev(). */ cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* * swapgeom_close() may be called from the biodone context, * where we cannot perform topology changes. Delegate the * work to the events thread. */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static int swapongeom_locked(struct cdev *dev, struct vnode *vp) { struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; pp = g_dev_getprovider(dev); if (pp == NULL) return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 1; /* Number of active I/Os, plus one for being active. */ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); return (0); } static int swapongeom(struct vnode *vp) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; } else { g_topology_lock(); error = swapongeom_locked(vp->v_rdev, vp); g_topology_unlock(); } VOP_UNLOCK(vp, 0); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp, 0); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) { int error, new, n; new = nsw_wcount_async_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new > nswbuf / 2 || new < 1) return (EINVAL); mtx_lock(&swbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. If the current async count is too low, * we will need to sqeeze our update slowly in. Sleep with a * higher priority than getpbuf() to finish faster. */ n = new - nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; msleep(&nsw_wcount_async, &swbuf_mtx, PSWP, "swpsysctl", 0); } } mtx_unlock(&swbuf_mtx); return (0); } static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); - KASSERT((object->flags & OBJ_NOSPLIT) != 0, + KASSERT((object->flags & OBJ_ANON) == 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); - KASSERT((object->flags & OBJ_NOSPLIT) != 0, + KASSERT((object->flags & OBJ_ANON) == 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 354868) +++ head/sys/vm/vm_fault.c (revision 354869) @@ -1,1928 +1,1927 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #define PFBAK 4 #define PFFOR 4 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT) #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) #define VM_FAULT_DONTNEED_MIN 1048576 struct faultstate { vm_page_t m; vm_object_t object; vm_pindex_t pindex; vm_page_t first_m; vm_object_t first_object; vm_pindex_t first_pindex; vm_map_t map; vm_map_entry_t entry; int map_generation; bool lookup_still_valid; struct vnode *vp; }; static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked); static int vm_pfault_oom_attempts = 3; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN, &vm_pfault_oom_attempts, 0, "Number of page allocation attempts in page fault handler before it " "triggers OOM handling"); static int vm_pfault_oom_wait = 10; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN, &vm_pfault_oom_wait, 0, "Number of seconds to wait for free pages before retrying " "the page fault handler"); static inline void release_page(struct faultstate *fs) { if (fs->m != NULL) { /* * fs->m's object lock might not be held, so the page must be * kept busy until we are done with it. */ vm_page_lock(fs->m); vm_page_deactivate(fs->m); vm_page_unlock(fs->m); vm_page_xunbusy(fs->m); fs->m = NULL; } } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = false; } } static void unlock_vp(struct faultstate *fs) { if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } static void fault_deallocate(struct faultstate *fs) { vm_object_pip_wakeup(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_WLOCK(fs->first_object); vm_page_free(fs->first_m); vm_object_pip_wakeup(fs->first_object); VM_OBJECT_WUNLOCK(fs->first_object); fs->first_m = NULL; } vm_object_deallocate(fs->first_object); unlock_map(fs); unlock_vp(fs); } static void unlock_and_deallocate(struct faultstate *fs) { VM_OBJECT_WUNLOCK(fs->object); fault_deallocate(fs); } static void vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot, vm_prot_t fault_type, int fault_flags, bool excl) { bool need_dirty; if (((prot & VM_PROT_WRITE) == 0 && (fault_flags & VM_FAULT_DIRTY) == 0) || (m->oflags & VPO_UNMANAGED) != 0) return; VM_OBJECT_ASSERT_LOCKED(m->object); VM_PAGE_OBJECT_BUSY_ASSERT(m); need_dirty = ((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_WIRE) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0; vm_object_set_writeable_dirty(m->object); if (!excl) /* * If two callers of vm_fault_dirty() with excl == * FALSE, one for the map entry with MAP_ENTRY_NOSYNC * flag set, other with flag clear, race, it is * possible for the no-NOSYNC thread to see m->dirty * != 0 and not clear PGA_NOSYNC. Take vm_page lock * around manipulation of PGA_NOSYNC and * vm_page_dirty() call to avoid the race. */ vm_page_lock(m); /* * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) { if (m->dirty == 0) { vm_page_aflag_set(m, PGA_NOSYNC); } } else { vm_page_aflag_clear(m, PGA_NOSYNC); } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also, since the page is now dirty, we can possibly tell * the pager to release any swap backing the page. Calling * the pager requires a write lock on the object. */ if (need_dirty) vm_page_dirty(m); if (!excl) vm_page_unlock(m); else if (need_dirty) vm_pager_page_unswapped(m); } /* * Unlocks fs.first_object and fs.map on success. */ static int vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { vm_page_t m, m_map; #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 vm_page_t m_super; int flags; #endif int psind, rv; MPASS(fs->vp == NULL); vm_object_busy(fs->first_object); m = vm_page_lookup(fs->first_object, fs->first_pindex); /* A busy page can be mapped for read|execute access. */ if (m == NULL || ((prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)) || !vm_page_all_valid(m)) { rv = KERN_FAILURE; goto out; } m_map = m; psind = 0; #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 if ((m->flags & PG_FICTITIOUS) == 0 && (m_super = vm_reserv_to_superpage(m)) != NULL && rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start && roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end && (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) & (pagesizes[m_super->psind] - 1)) && !wired && pmap_ps_enabled(fs->map->pmap)) { flags = PS_ALL_VALID; if ((prot & VM_PROT_WRITE) != 0) { /* * Create a superpage mapping allowing write access * only if none of the constituent pages are busy and * all of them are already dirty (except possibly for * the page that was faulted on). */ flags |= PS_NONE_BUSY; if ((fs->first_object->flags & OBJ_UNMANAGED) == 0) flags |= PS_ALL_DIRTY; } if (vm_page_ps_test(m_super, flags, m)) { m_map = m_super; psind = m_super->psind; vaddr = rounddown2(vaddr, pagesizes[psind]); /* Preset the modified bit for dirty superpages. */ if ((flags & PS_ALL_DIRTY) != 0) fault_type |= VM_PROT_WRITE; } } #endif rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type | PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind); if (rv != KERN_SUCCESS) goto out; if (m_hold != NULL) { *m_hold = m; vm_page_wire(m); } vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false); if (psind == 0 && !wired) vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true); VM_OBJECT_RUNLOCK(fs->first_object); vm_map_lookup_done(fs->map, fs->entry); curthread->td_ru.ru_minflt++; out: vm_object_unbusy(fs->first_object); return (rv); } static void vm_fault_restore_map_lock(struct faultstate *fs) { VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(REFCOUNT_COUNT(fs->first_object->paging_in_progress) > 0); if (!vm_map_trylock_read(fs->map)) { VM_OBJECT_WUNLOCK(fs->first_object); vm_map_lock_read(fs->map); VM_OBJECT_WLOCK(fs->first_object); } fs->lookup_still_valid = true; } static void vm_fault_populate_check_page(vm_page_t m) { /* * Check each page to ensure that the pager is obeying the * interface: the page must be installed in the object, fully * valid, and exclusively busied. */ MPASS(m != NULL); MPASS(vm_page_all_valid(m)); MPASS(vm_page_xbusied(m)); } static void vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, vm_pindex_t last) { vm_page_t m; vm_pindex_t pidx; VM_OBJECT_ASSERT_WLOCKED(object); MPASS(first <= last); for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); vm_page_lock(m); vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } } static int vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { struct mtx *m_mtx; vm_offset_t vaddr; vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; int i, npages, psind, rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(REFCOUNT_COUNT(fs->first_object->paging_in_progress) > 0); MPASS(fs->first_object->backing_object == NULL); MPASS(fs->lookup_still_valid); pager_first = OFF_TO_IDX(fs->entry->offset); pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1; unlock_map(fs); unlock_vp(fs); /* * Call the pager (driver) populate() method. * * There is no guarantee that the method will be called again * if the current fault is for read, and a future fault is * for write. Report the entry's maximum allowed protection * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, fault_type, fs->entry->max_protection, &pager_first, &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { /* * VM_PAGER_BAD is the backdoor for a pager to request * normal fault handling. */ vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ return (KERN_NOT_RECEIVER); } if (rv != VM_PAGER_OK) return (KERN_FAILURE); /* AKA SIGSEGV */ /* Ensure that the driver is obeying the interface. */ MPASS(pager_first <= pager_last); MPASS(fs->first_pindex <= pager_last); MPASS(fs->first_pindex >= pager_first); MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) { vm_fault_populate_cleanup(fs->first_object, pager_first, pager_last); return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ } /* * The map is unchanged after our last unlock. Process the fault. * * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. * In case the pager expanded the range, clip it to fit into * the map entry. */ map_first = OFF_TO_IDX(fs->entry->offset); if (map_first > pager_first) { vm_fault_populate_cleanup(fs->first_object, pager_first, map_first - 1); pager_first = map_first; } map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1; if (map_last < pager_last) { vm_fault_populate_cleanup(fs->first_object, map_last + 1, pager_last); pager_last = map_last; } for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx); pidx <= pager_last; pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last || !pmap_ps_enabled(fs->map->pmap) || wired)) psind = 0; #else psind = 0; #endif npages = atop(pagesizes[psind]); for (i = 0; i < npages; i++) { vm_fault_populate_check_page(&m[i]); vm_fault_dirty(fs->entry, &m[i], prot, fault_type, fault_flags, true); } VM_OBJECT_WUNLOCK(fs->first_object); rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), psind); #if defined(__amd64__) if (psind > 0 && rv == KERN_FAILURE) { for (i = 0; i < npages; i++) { rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i), &m[i], prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); MPASS(rv == KERN_SUCCESS); } } #else MPASS(rv == KERN_SUCCESS); #endif VM_OBJECT_WLOCK(fs->first_object); m_mtx = NULL; for (i = 0; i < npages; i++) { if ((fault_flags & VM_FAULT_WIRE) != 0) { vm_page_wire(&m[i]); } else { vm_page_change_lock(&m[i], &m_mtx); vm_page_activate(&m[i]); } if (m_hold != NULL && m[i].pindex == fs->first_pindex) { *m_hold = &m[i]; vm_page_wire(&m[i]); } vm_page_xunbusy(&m[i]); } if (m_mtx != NULL) mtx_unlock(m_mtx); } curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Control signal to deliver on protection fault"); /* compat definition to keep common code for signal translation */ #define UCODE_PAGEFLT 12 #ifdef T_PAGEFLT _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT"); #endif /* * vm_fault_trap: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, int *signo, int *ucode) { int result; MPASS(signo == NULL || ucode != NULL); #ifdef KTRACE if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT)) ktrfault(vaddr, fault_type); #endif result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags, NULL); KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE || result == KERN_INVALID_ADDRESS || result == KERN_RESOURCE_SHORTAGE || result == KERN_PROTECTION_FAILURE || result == KERN_OUT_OF_BOUNDS, ("Unexpected Mach error %d from vm_fault()", result)); #ifdef KTRACE if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND)) ktrfaultend(result); #endif if (result != KERN_SUCCESS && signo != NULL) { switch (result) { case KERN_FAILURE: case KERN_INVALID_ADDRESS: *signo = SIGSEGV; *ucode = SEGV_MAPERR; break; case KERN_RESOURCE_SHORTAGE: *signo = SIGBUS; *ucode = BUS_OOMERR; break; case KERN_OUT_OF_BOUNDS: *signo = SIGBUS; *ucode = BUS_OBJERR; break; case KERN_PROTECTION_FAILURE: if (prot_fault_translation == 0) { /* * Autodetect. This check also covers * the images without the ABI-tag ELF * note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && curproc->p_osrel >= P_OSREL_SIGSEGV) { *signo = SIGSEGV; *ucode = SEGV_ACCERR; } else { *signo = SIGBUS; *ucode = UCODE_PAGEFLT; } } else if (prot_fault_translation == 1) { /* Always compat mode. */ *signo = SIGBUS; *ucode = UCODE_PAGEFLT; } else { /* Always SIGSEGV mode. */ *signo = SIGSEGV; *ucode = SEGV_ACCERR; } break; default: KASSERT(0, ("Unexpected Mach error %d from vm_fault()", result)); break; } } return (result); } static int vm_fault_lock_vnode(struct faultstate *fs) { struct vnode *vp; int error, locked; if (fs->object->type != OBJT_VNODE) return (KERN_SUCCESS); vp = fs->object->handle; if (vp == fs->vp) { ASSERT_VOP_LOCKED(vp, "saved vnode is not locked"); return (KERN_SUCCESS); } /* * Perform an unlock in case the desired vnode changed while * the map was unlocked during a retry. */ unlock_vp(fs); locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* * We must not sleep acquiring the vnode lock while we have * the page exclusive busied or the object's * paging-in-progress count incremented. Otherwise, we could * deadlock. */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error == 0) { fs->vp = vp; return (KERN_SUCCESS); } vhold(vp); release_page(fs); unlock_and_deallocate(fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs->vp = vp; KASSERT(error == 0, ("vm_fault: vget failed %d", error)); return (KERN_RESOURCE_SHORTAGE); } int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { struct faultstate fs; struct domainset *dset; vm_object_t next_object, retry_object; vm_offset_t e_end, e_start; vm_pindex_t retry_pindex; vm_prot_t prot, retry_prot; int ahead, alloc_req, behind, cluster_offset, era, faultcount; int nera, oom, result, rv; u_char behavior; boolean_t wired; /* Passed by reference. */ bool dead, hardfault, is_first_object_locked; VM_CNT_INC(v_vm_faults); if ((curthread->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); fs.vp = NULL; faultcount = 0; nera = -1; hardfault = false; RetryFault: oom = 0; RetryFault_oom: /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type | VM_PROT_FAULT_LOOKUP, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { unlock_vp(&fs); return (result); } fs.map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("%s: fault on nofault entry, addr: %#lx", __func__, (u_long)vaddr); } if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION && fs.entry->wiring_thread != curthread) { vm_map_unlock_read(fs.map); vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { unlock_vp(&fs); fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else vm_map_unlock(fs.map); goto RetryFault; } MPASS((fs.entry->eflags & MAP_ENTRY_GUARD) == 0); if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); else KASSERT((fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); /* * Try to avoid lock contention on the top-level object through * special-case handling of some types of page faults, specifically, * those that are mapping an existing page from the top-level object. * Under this condition, a read lock on the object suffices, allowing * multiple page faults of a similar type to run in parallel. */ if (fs.vp == NULL /* avoid locked vnode leak */ && (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { VM_OBJECT_RLOCK(fs.first_object); rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type, fault_flags, wired, m_hold); if (rv == KERN_SUCCESS) return (rv); if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); } } else { VM_OBJECT_WLOCK(fs.first_object); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = true; fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is marked for imminent termination, * we retry here, since the collapse pass has raced * with us. Otherwise, if we see terminally dead * object, return fail. */ if ((fs.object->flags & OBJ_DEAD) != 0) { dead = fs.object->type == OBJT_DEAD; unlock_and_deallocate(&fs); if (dead) return (KERN_PROTECTION_FAILURE); pause("vmf_de", 1); goto RetryFault; } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * Wait/Retry if the page is busy. We have to do this * if the page is either exclusive or shared busy * because the vm_pager may be using read busy for * pageouts (and even pageins if it is the vnode * pager), and we could end up trying to pagein and * pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a shared busied page except, perhaps, * to pmap it. */ if (vm_page_tryxbusy(fs.m) == 0) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_free(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); VM_CNT_INC(v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } /* * The page is marked busy for other processes and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ if (!vm_page_all_valid(fs.m)) goto readrest; break; /* break to PAGE HAS BEEN FOUND */ } KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m)); /* * Page is not resident. If the pager might contain the page * or this is the beginning of the search, allocate a new * page. (Default objects are zero-fill, so there is no real * pager for them.) */ if (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object) { if ((fs.object->flags & OBJ_SIZEVNLOCK) != 0) { rv = vm_fault_lock_vnode(&fs); MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE); if (rv == KERN_RESOURCE_SHORTAGE) goto RetryFault; } if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_OUT_OF_BOUNDS); } if (fs.object == fs.first_object && (fs.first_object->flags & OBJ_POPULATE) != 0 && fs.first_object->shadow_count == 0) { rv = vm_fault_populate(&fs, prot, fault_type, fault_flags, wired, m_hold); switch (rv) { case KERN_SUCCESS: case KERN_FAILURE: unlock_and_deallocate(&fs); return (rv); case KERN_RESOURCE_SHORTAGE: unlock_and_deallocate(&fs); goto RetryFault; case KERN_NOT_RECEIVER: /* * Pager's populate() method * returned VM_PAGER_BAD. */ break; default: panic("inconsistent return codes"); } } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ dset = fs.object->domain.dr_policy; if (dset == NULL) dset = curthread->td_domain.dr_policy; if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 vm_object_color(fs.object, atop(vaddr) - fs.pindex); #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); if (vm_pfault_oom_attempts < 0 || oom < vm_pfault_oom_attempts) { oom++; vm_waitpfault(dset, vm_pfault_oom_wait * hz); goto RetryFault_oom; } if (bootverbose) printf( "proc %d (%s) failed to alloc page on fault, starting OOM\n", curproc->p_pid, curproc->p_comm); vm_pageout_oom(VM_OOM_MEM_PF); goto RetryFault; } } readrest: /* * At this point, we have either allocated a new page or found * an existing page that is only partially valid. * * We hold a reference on the current object and the page is * exclusive busied. */ /* * If the pager for the current object might have the page, * then determine the number of additional pages to read and * potentially reprioritize previously read pages for earlier * reclamation. These operations should only be performed * once per page fault. Even if the current pager doesn't * have the page, the number of additional pages to read will * apply to subsequent objects in the shadow chain. */ if (fs.object->type != OBJT_DEFAULT && nera == -1 && !P_KILLED(curproc)) { KASSERT(fs.lookup_still_valid, ("map unlocked")); era = fs.entry->read_ahead; behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM) { nera = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { nera = VM_FAULT_READ_AHEAD_MAX; if (vaddr == fs.entry->next_read) vm_fault_dontneed(&fs, vaddr, nera); } else if (vaddr == fs.entry->next_read) { /* * This is a sequential fault. Arithmetically * increase the requested number of pages in * the read-ahead window. The requested * number of pages is "# of sequential faults * x (read ahead min + 1) + read ahead min" */ nera = VM_FAULT_READ_AHEAD_MIN; if (era > 0) { nera += era + 1; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; } if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_dontneed(&fs, vaddr, nera); } else { /* * This is a non-sequential fault. */ nera = 0; } if (era != nera) { /* * A read lock on the map suffices to update * the read ahead count safely. */ fs.entry->read_ahead = nera; } /* * Prepare for unlocking the map. Save the map * entry's start and end addresses, which are used to * optimize the size of the pager operation below. * Even if the map entry's addresses change after * unlocking the map, using the saved addresses is * safe. */ e_start = fs.entry->start; e_end = fs.entry->end; } /* * Call the pager to retrieve the page if there is a chance * that the pager has it, and potentially retrieve additional * pages at the same time. */ if (fs.object->type != OBJT_DEFAULT) { /* * Release the map lock before locking the vnode or * sleeping in the pager. (If the current object has * a shadow, then an earlier iteration of this loop * may have already unlocked the map.) */ unlock_map(&fs); rv = vm_fault_lock_vnode(&fs); MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE); if (rv == KERN_RESOURCE_SHORTAGE) goto RetryFault; KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * Page in the requested page and hint the pager, * that it may bring up surrounding pages. */ if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else { /* Is this a sequential fault? */ if (nera > 0) { behind = 0; ahead = nera; } else { /* * Request a cluster of pages that is * aligned to a VM_FAULT_READ_DEFAULT * page offset boundary within the * object. Alignment to a page offset * boundary is more likely to coincide * with the underlying file system * block than alignment to a virtual * address boundary. */ cluster_offset = fs.pindex % VM_FAULT_READ_DEFAULT; behind = ulmin(cluster_offset, atop(vaddr - e_start)); ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset; } ahead = ulmin(ahead, atop(e_end - vaddr) - 1); } rv = vm_pager_get_pages(fs.object, &fs.m, 1, &behind, &ahead); if (rv == VM_PAGER_OK) { faultcount = behind + 1 + ahead; hardfault = true; break; /* break to PAGE HAS BEEN FOUND */ } if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * If an I/O error occurred or the requested page was * outside the range of the pager, clean up and return * an error. */ if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { if (!vm_page_wired(fs.m)) vm_page_free(fs.m); else vm_page_xunbusy(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return (KERN_OUT_OF_BOUNDS); } /* * The requested page does not exist at this object/ * offset. Remove the invalid page from the object, * waking up anyone waiting for it, and continue on to * the next object. However, if this is the top-level * object, we must leave the busy page in place to * prevent another process from rushing past us, and * inserting the page in that object at the same time * that we are. */ if (fs.object != fs.first_object) { if (!vm_page_wired(fs.m)) vm_page_free(fs.m); else vm_page_xunbusy(fs.m); fs.m = NULL; } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { VM_CNT_INC(v_ozfod); } VM_CNT_INC(v_zfod); vm_page_valid(fs.m); /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } vm_page_assert_xbusied(fs.m); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = false; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ - ((fs.object->type == OBJT_DEFAULT) || - (fs.object->type == OBJT_SWAP)) && + ((fs.object->flags & OBJ_ANON) != 0) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { (void)vm_page_remove(fs.m); vm_page_replace_checked(fs.m, fs.first_object, fs.first_pindex, fs.first_m); vm_page_free(fs.first_m); vm_page_dirty(fs.m); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(fs.m, fs.first_object, fs.object, OFF_TO_IDX( fs.first_object->backing_object_offset)); #endif VM_OBJECT_WUNLOCK(fs.object); fs.first_m = fs.m; fs.m = NULL; VM_CNT_INC(v_cow_optim); } else { VM_OBJECT_WUNLOCK(fs.object); /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); vm_page_valid(fs.first_m); if (wired && (fault_flags & VM_FAULT_WIRE) == 0) { vm_page_wire(fs.first_m); vm_page_unwire(fs.m, PQ_INACTIVE); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); /* * We only try to prefault read-only mappings to the * neighboring pages when this copy-on-write fault is * a hard fault. In other cases, trying to prefault * is typically wasted effort. */ if (faultcount == 0) faultcount = 1; /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); VM_CNT_INC(v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = true; if (fs.map->timestamp != fs.map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; fault_type &= retry_prot; if (prot == 0) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* Reassert because wired may have changed. */ KASSERT(wired || (fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); } } /* * If the page was filled by a pager, save the virtual address that * should be faulted on next under a sequential access pattern to the * map entry. A read lock on the map suffices to update this address * safely. */ if (hardfault) fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; vm_page_assert_xbusied(fs.m); vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(vm_page_all_valid(fs.m), ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 && wired == 0) vm_fault_prefault(&fs, vaddr, faultcount > 0 ? behind : PFBAK, faultcount > 0 ? ahead : PFFOR, false); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if ((fault_flags & VM_FAULT_WIRE) != 0) { vm_page_wire(fs.m); } else { vm_page_lock(fs.m); vm_page_activate(fs.m); vm_page_unlock(fs.m); } if (m_hold != NULL) { *m_hold = fs.m; vm_page_wire(fs.m); } vm_page_xunbusy(fs.m); /* * Unlock everything, and return */ fault_deallocate(&fs); if (hardfault) { VM_CNT_INC(v_io_faults); curthread->td_ru.ru_majflt++; #ifdef RACCT if (racct_enable && fs.object->type == OBJT_VNODE) { PROC_LOCK(curproc); if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { racct_add_force(curproc, RACCT_WRITEBPS, PAGE_SIZE + behind * PAGE_SIZE); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } else { racct_add_force(curproc, RACCT_READBPS, PAGE_SIZE + ahead * PAGE_SIZE); racct_add_force(curproc, RACCT_READIOPS, 1); } PROC_UNLOCK(curproc); } #endif } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of pages that precede the faulting pindex within * the first object of the shadow chain. Essentially, perform the equivalent * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes * the faulting pindex by the cluster size when the pages read by vm_fault() * cross a cluster-size boundary. The cluster size is the greater of the * smallest superpage size and VM_FAULT_DONTNEED_MIN. * * When "fs->first_object" is a shadow object, the pages in the backing object * that precede the faulting pindex are deactivated by vm_fault(). So, this * function must only be concerned with pages in the first object. */ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { vm_map_entry_t entry; vm_object_t first_object, object; vm_offset_t end, start; vm_page_t m, m_next; vm_pindex_t pend, pstart; vm_size_t size; object = fs->object; VM_OBJECT_ASSERT_WLOCKED(object); first_object = fs->first_object; if (first_object != object) { if (!VM_OBJECT_TRYWLOCK(first_object)) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WLOCK(first_object); VM_OBJECT_WLOCK(object); } } /* Neither fictitious nor unmanaged pages can be reclaimed. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { size = VM_FAULT_DONTNEED_MIN; if (MAXPAGESIZES > 1 && size < pagesizes[1]) size = pagesizes[1]; end = rounddown2(vaddr, size); if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && (entry = fs->entry)->start < end) { if (end - entry->start < size) start = entry->start; else start = end - size; pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); pstart = OFF_TO_IDX(entry->offset) + atop(start - entry->start); m_next = vm_page_find_least(first_object, pstart); pend = OFF_TO_IDX(entry->offset) + atop(end - entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); if (!vm_page_all_valid(m) || vm_page_busied(m)) continue; /* * Don't clear PGA_REFERENCED, since it would * likely represent a reference by a different * process. * * Typically, at this point, prefetched pages * are still in the inactive queue. Only * pages that triggered page faults are in the * active queue. */ vm_page_lock(m); if (!vm_page_inactive(m)) vm_page_deactivate(m); vm_page_unlock(m); } } } if (first_object != object) VM_OBJECT_WUNLOCK(first_object); } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; entry = fs->entry; if (addra < backward * PAGE_SIZE) { starta = entry->start; } else { starta = addra - backward * PAGE_SIZE; if (starta < entry->start) starta = entry->start; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE); if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; if (!obj_locked) VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); break; } if (vm_page_all_valid(m) && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned. */ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); /* * Check for illegal addresses. */ if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. * * If vm_fault_disable_pagefaults() was called, * i.e., TDP_NOFAULTING is set, we must not sleep nor * acquire MD VM locks, which means we must not call * vm_fault(). Some (out of tree) callers mark * too wide a code area with vm_fault_disable_pagefaults() * already, use the VM_PROT_QUICK_NOFAULT flag to request * the proper behaviour explicitly. */ if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && (curthread->td_pflags & TDP_NOFAULTING) != 0) goto error; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) vm_page_unwire(*mp, PQ_INACTIVE); return (-1); } /* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; access = prot = dst_entry->protection; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ - dst_object = vm_object_allocate(OBJT_DEFAULT, + dst_object = vm_object_allocate_anon( atop(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif dst_object->domain = src_object->domain; dst_object->charge = dst_entry->end - dst_entry->start; } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else if ((dst_object->type == OBJT_DEFAULT || dst_object->type == OBJT_SWAP) && dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object doesn't share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in. * Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); vm_wait(dst_object); VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); dst_m->dirty = dst_m->valid = src_m->valid; } else { dst_m = src_m; if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0) goto again; if (dst_m->pindex >= dst_object->size) { /* * We are upgrading. Index can occur * out of bounds if the object type is * vnode and the file was truncated. */ vm_page_xunbusy(dst_m); break; } } VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. * * The page can be invalid if the user called * msync(MS_INVALIDATE) or truncated the backing vnode * or shared memory object. In this case, do not * insert it into pmap, but still do the copy so that * all copies of the wired map entry have similar * backing pages. */ if (vm_page_all_valid(dst_m)) { pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); } /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { if (src_m != dst_m) { vm_page_unwire(src_m, PQ_INACTIVE); vm_page_wire(dst_m); } else { KASSERT(vm_page_wired(dst_m), ("dst_m %p is not wired", dst_m)); } } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); } Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 354868) +++ head/sys/vm/vm_map.c (revision 354869) @@ -1,4977 +1,4976 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a self-adjusting binary search tree of these * entries is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t mapzone; static uma_zone_t vmspace_zone; static int vmspace_zinit(void *mem, int size, int flags); static int vm_map_zinit(void *mem, int ize, int flags); static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max); static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry); static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow); static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr); #define ENTRY_CHARGED(e) ((e)->cred != NULL || \ ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \ !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, #ifdef INVARIANTS vm_map_zdtor, #else NULL, #endif vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_prealloc(mapzone, MAX_KMAP); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } static int vmspace_zinit(void *mem, int size, int flags) { struct vmspace *vm; vm = (struct vmspace *)mem; vm->vm_map.pmap = NULL; (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags); PMAP_LOCK_INIT(vmspace_pmap(vm)); return (0); } static int vm_map_zinit(void *mem, int size, int flags) { vm_map_t map; map = (vm_map_t)mem; memset(map, 0, sizeof(*map)); mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "vm map (user)"); return (0); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); } static void vm_map_zdtor(void *mem, int size, void *arg) { vm_map_t map; map = (vm_map_t)mem; KASSERT(map->nentries == 0, ("map %p nentries == %d on free.", map, map->nentries)); KASSERT(map->size == 0, ("map %p size == %lu on free.", map, (unsigned long)map->size)); } #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit(). */ struct vmspace * vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit) { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL")); if (!pinit(vmspace_pmap(vm))) { uma_zfree(vmspace_zone, vm); return (NULL); } CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max); vm->vm_refcnt = 1; vm->vm_shm = NULL; vm->vm_swrss = 0; vm->vm_tsize = 0; vm->vm_dsize = 0; vm->vm_ssize = 0; vm->vm_taddr = 0; vm->vm_daddr = 0; vm->vm_maxsaddr = 0; return (vm); } #ifdef RACCT static void vmspace_container_reset(struct proc *p) { PROC_LOCK(p); racct_set(p, RACCT_DATA, 0); racct_set(p, RACCT_STACK, 0); racct_set(p, RACCT_RSS, 0); racct_set(p, RACCT_MEMLOCK, 0); racct_set(p, RACCT_VMEM, 0); PROC_UNLOCK(p); } #endif static inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). */ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); pmap_release(vmspace_pmap(vm)); vm->vm_map.pmap = NULL; uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmspace_free() called"); if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; p->p_vmspace = NULL; PROC_VMSPACE_UNLOCK(p); KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } void vmspace_exit(struct thread *td) { int refcnt; struct vmspace *vm; struct proc *p; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * * The last exiting process to reach this point releases as * much of the environment as it can. vmspace_dofree() is the * slower fallback in case another process had a temporary * reference to the vmspace. */ p = td->td_proc; vm = p->p_vmspace; atomic_add_int(&vmspace0.vm_refcnt, 1); refcnt = vm->vm_refcnt; do { if (refcnt > 1 && p->p_vmspace != &vmspace0) { /* Switch now since other proc might free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1)); if (refcnt == 1) { if (p->p_vmspace != vm) { /* vmspace not yet freed, switch back */ PROC_VMSPACE_LOCK(p); p->p_vmspace = vm; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } pmap_remove_pages(vmspace_pmap(vm)); /* Switch now since this proc will free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); vmspace_dofree(vm); } #ifdef RACCT if (racct_enable) vmspace_container_reset(p); #endif } /* Acquire reference to vmspace owned by another process. */ struct vmspace * vmspace_acquire_ref(struct proc *p) { struct vmspace *vm; int refcnt; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; if (vm == NULL) { PROC_VMSPACE_UNLOCK(p); return (NULL); } refcnt = vm->vm_refcnt; do { if (refcnt <= 0) { /* Avoid 0->1 transition */ PROC_VMSPACE_UNLOCK(p); return (NULL); } } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1)); if (vm != p->p_vmspace) { PROC_VMSPACE_UNLOCK(p); vmspace_free(vm); return (NULL); } PROC_VMSPACE_UNLOCK(p); return (vm); } /* * Switch between vmspaces in an AIO kernel process. * * The new vmspace is either the vmspace of a user process obtained * from an active AIO request or the initial vmspace of the AIO kernel * process (when it is idling). Because user processes will block to * drain any active AIO requests before proceeding in exit() or * execve(), the reference count for vmspaces from AIO requests can * never be 0. Similarly, AIO kernel processes hold an extra * reference on their initial vmspace for the life of the process. As * a result, the 'newvm' vmspace always has a non-zero reference * count. This permits an additional reference on 'newvm' to be * acquired via a simple atomic increment rather than the loop in * vmspace_acquire_ref() above. */ void vmspace_switch_aio(struct vmspace *newvm) { struct vmspace *oldvm; /* XXX: Need some way to assert that this is an aio daemon. */ KASSERT(newvm->vm_refcnt > 0, ("vmspace_switch_aio: newvm unreferenced")); oldvm = curproc->p_vmspace; if (oldvm == newvm) return; /* * Point to the new address space and refer to it. */ curproc->p_vmspace = newvm; atomic_add_int(&newvm->vm_refcnt, 1); /* Activate the new mapping. */ pmap_activate(curthread); vmspace_free(oldvm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_xlock_(&map->lock, file, line); map->timestamp++; } void vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add) { vm_object_t object, object1; struct vnode *vp; if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0) return; KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with execs")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for text, entry %p", entry)); VM_OBJECT_RLOCK(object); while ((object1 = object->backing_object) != NULL) { VM_OBJECT_RLOCK(object1); VM_OBJECT_RUNLOCK(object); object = object1; } vp = NULL; if (object->type == OBJT_DEAD) { /* * For OBJT_DEAD objects, v_writecount was handled in * vnode_pager_dealloc(). */ } else if (object->type == OBJT_VNODE) { vp = object->handle; } else if (object->type == OBJT_SWAP) { KASSERT((object->flags & OBJ_TMPFS_NODE) != 0, ("vm_map_entry_set_vnode_text: swap and !TMPFS " "entry %p, object %p, add %d", entry, object, add)); /* * Tmpfs VREG node, which was reclaimed, has * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS. In * this case there is no v_writecount to adjust. */ if ((object->flags & OBJ_TMPFS) != 0) vp = object->un_pager.swp.swp_tmpfs; } else { KASSERT(0, ("vm_map_entry_set_vnode_text: wrong object type, " "entry %p, object %p, add %d", entry, object, add)); } if (vp != NULL) { if (add) { VOP_SET_TEXT_CHECKED(vp); VM_OBJECT_RUNLOCK(object); } else { vhold(vp); VM_OBJECT_RUNLOCK(object); vn_lock(vp, LK_SHARED | LK_RETRY); VOP_UNSET_TEXT_CHECKED(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } } else { VM_OBJECT_RUNLOCK(object); } } /* * Use a different name for this vm_map_entry field when it's use * is not consistent with its use as part of an ordered search tree. */ #define defer_next right static void vm_map_process_deferred(void) { struct thread *td; vm_map_entry_t entry, next; vm_object_t object; td = curthread; entry = td->td_map_def_user; td->td_map_def_user = NULL; while (entry != NULL) { next = entry->defer_next; MPASS((entry->eflags & (MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC)); if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) { /* * Decrement the object's writemappings and * possibly the vnode's v_writecount. */ KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with writecount")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for writecount")); vm_pager_release_writecount(object, entry->start, entry->end); } vm_map_entry_set_vnode_text(entry, false); vm_map_entry_deallocate(entry, FALSE); entry = next; } } #ifdef INVARIANTS static void _vm_map_assert_locked(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_assert_(&map->system_mtx, MA_OWNED, file, line); else sx_assert_(&map->lock, SA_XLOCKED, file, line); } #define VM_MAP_ASSERT_LOCKED(map) \ _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE) enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL }; #ifdef DIAGNOSTIC static int enable_vmmap_check = VMMAP_CHECK_UNLOCK; #else static int enable_vmmap_check = VMMAP_CHECK_NONE; #endif SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN, &enable_vmmap_check, 0, "Enable vm map consistency checking"); static void _vm_map_assert_consistent(vm_map_t map, int check); #define VM_MAP_ASSERT_CONSISTENT(map) \ _vm_map_assert_consistent(map, VMMAP_CHECK_ALL) #ifdef DIAGNOSTIC #define VM_MAP_UNLOCK_CONSISTENT(map) do { \ if (map->nupdates > map->nentries) { \ _vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \ map->nupdates = 0; \ } \ } while (0) #else #define VM_MAP_UNLOCK_CONSISTENT(map) #endif #else #define VM_MAP_ASSERT_LOCKED(map) #define VM_MAP_ASSERT_CONSISTENT(map) #define VM_MAP_UNLOCK_CONSISTENT(map) #endif /* INVARIANTS */ void _vm_map_unlock(vm_map_t map, const char *file, int line) { VM_MAP_UNLOCK_CONSISTENT(map); if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_xunlock_(&map->lock, file, line); vm_map_process_deferred(); } } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_slock_(&map->lock, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); } } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_xlock_(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_slock_(&map->lock, file, line); return (error == 0); } /* * _vm_map_lock_upgrade: [ internal use only ] * * Tries to upgrade a read (shared) lock on the specified map to a write * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a * non-zero value if the upgrade fails. If the upgrade fails, the map is * returned without a read or write lock held. * * Requires that the map be read locked. */ int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { unsigned int last_timestamp; if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else { if (!sx_try_upgrade_(&map->lock, file, line)) { last_timestamp = map->timestamp; sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); /* * If the map's timestamp does not change while the * map is unlocked, then the upgrade succeeds. */ sx_xlock_(&map->lock, file, line); if (last_timestamp != map->timestamp) { sx_xunlock_(&map->lock, file, line); return (1); } } } map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else { VM_MAP_UNLOCK_CONSISTENT(map); sx_downgrade_(&map->lock, file, line); } } /* * vm_map_locked: * * Returns a non-zero value if the caller holds a write (exclusive) lock * on the specified map and the value "0" otherwise. */ int vm_map_locked(vm_map_t map) { if (map->system_map) return (mtx_owned(&map->system_mtx)); else return (sx_xlocked(&map->lock)); } /* * _vm_map_unlock_and_wait: * * Atomically releases the lock on the specified map and puts the calling * thread to sleep. The calling thread will remain asleep until either * vm_map_wakeup() is performed on the map or the specified timeout is * exceeded. * * WARNING! This function does not perform deferred deallocations of * objects and map entries. Therefore, the calling thread is expected to * reacquire the map lock after reawakening and later perform an ordinary * unlock operation, such as vm_map_unlock(), before completing its * operation on the map. */ int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line) { VM_MAP_UNLOCK_CONSISTENT(map); mtx_lock(&map_sleep_mtx); if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else sx_xunlock_(&map->lock, file, line); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", timo)); } /* * vm_map_wakeup: * * Awaken any threads that have slept on the map using * vm_map_unlock_and_wait(). */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the map unlock * and the msleep() in _vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } void vm_map_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); map->busy++; } void vm_map_unbusy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); KASSERT(map->busy, ("vm_map_unbusy: not busy")); if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) { vm_map_modflags(map, 0, MAP_BUSY_WAKEUP); wakeup(&map->busy); } } void vm_map_wait_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); while (map->busy) { vm_map_modflags(map, MAP_BUSY_WAKEUP, 0); if (map->system_map) msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0); else sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0); } map->timestamp++; } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) { vm_map_t result; result = uma_zalloc(mapzone, M_WAITOK); CTR1(KTR_VM, "vm_map_create: %p", result); _vm_map_init(result, pmap, min, max); return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. */ static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { map->header.next = map->header.prev = &map->header; map->header.eflags = MAP_ENTRY_HEADER; map->needs_wakeup = FALSE; map->system_map = 0; map->pmap = pmap; map->header.end = min; map->header.start = max; map->flags = 0; map->root = NULL; map->timestamp = 0; map->busy = 0; map->anon_loc = 0; #ifdef DIAGNOSTIC map->nupdates = 0; #endif } void vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, pmap, min, max); mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "user map"); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. */ static inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_max_free_{left,right}: * * Compute the size of the largest free gap between two entries, * one the root of a tree and the other the ancestor of that root * that is the least or greatest ancestor found on the search path. */ static inline vm_size_t vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor) { return (root->left != NULL ? root->left->max_free : root->start - left_ancestor->end); } static inline vm_size_t vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor) { return (root->right != NULL ? root->right->max_free : right_ancestor->start - root->end); } #define SPLAY_LEFT_STEP(root, y, rlist, test) do { \ vm_size_t max_free; \ \ /* \ * Infer root->right->max_free == root->max_free when \ * y->max_free < root->max_free || root->max_free == 0. \ * Otherwise, look right to find it. \ */ \ y = root->left; \ max_free = root->max_free; \ KASSERT(max_free >= vm_map_entry_max_free_right(root, rlist), \ ("%s: max_free invariant fails", __func__)); \ if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free) \ max_free = vm_map_entry_max_free_right(root, rlist); \ if (y != NULL && (test)) { \ /* Rotate right and make y root. */ \ root->left = y->right; \ y->right = root; \ if (max_free < y->max_free) \ root->max_free = max_free = MAX(max_free, \ vm_map_entry_max_free_left(root, y)); \ root = y; \ y = root->left; \ } \ /* Copy right->max_free. Put root on rlist. */ \ root->max_free = max_free; \ KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \ ("%s: max_free not copied from right", __func__)); \ root->left = rlist; \ rlist = root; \ root = y; \ } while (0) #define SPLAY_RIGHT_STEP(root, y, llist, test) do { \ vm_size_t max_free; \ \ /* \ * Infer root->left->max_free == root->max_free when \ * y->max_free < root->max_free || root->max_free == 0. \ * Otherwise, look left to find it. \ */ \ y = root->right; \ max_free = root->max_free; \ KASSERT(max_free >= vm_map_entry_max_free_left(root, llist), \ ("%s: max_free invariant fails", __func__)); \ if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free) \ max_free = vm_map_entry_max_free_left(root, llist); \ if (y != NULL && (test)) { \ /* Rotate left and make y root. */ \ root->right = y->left; \ y->left = root; \ if (max_free < y->max_free) \ root->max_free = max_free = MAX(max_free, \ vm_map_entry_max_free_right(root, y)); \ root = y; \ y = root->right; \ } \ /* Copy left->max_free. Put root on llist. */ \ root->max_free = max_free; \ KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \ ("%s: max_free not copied from left", __func__)); \ root->right = llist; \ llist = root; \ root = y; \ } while (0) /* * Walk down the tree until we find addr or a NULL pointer where addr would go, * breaking off left and right subtrees of nodes less than, or greater than * addr. Treat pointers to nodes with max_free < length as NULL pointers. * llist and rlist are the two sides in reverse order (bottom-up), with llist * linked by the right pointer and rlist linked by the left pointer in the * vm_map_entry, and both lists terminated by &map->header. This function, and * the subsequent call to vm_map_splay_merge, rely on the start and end address * values in &map->header. */ static vm_map_entry_t vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length, vm_map_entry_t *out_llist, vm_map_entry_t *out_rlist) { vm_map_entry_t llist, rlist, root, y; llist = rlist = &map->header; root = map->root; while (root != NULL && root->max_free >= length) { KASSERT(llist->end <= root->start && root->end <= rlist->start, ("%s: root not within tree bounds", __func__)); if (addr < root->start) { SPLAY_LEFT_STEP(root, y, rlist, y->max_free >= length && addr < y->start); } else if (addr >= root->end) { SPLAY_RIGHT_STEP(root, y, llist, y->max_free >= length && addr >= y->end); } else break; } *out_llist = llist; *out_rlist = rlist; return (root); } static void vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *iolist) { vm_map_entry_t rlist, y; root = root->right; rlist = *iolist; while (root != NULL) SPLAY_LEFT_STEP(root, y, rlist, true); *iolist = rlist; } static void vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *iolist) { vm_map_entry_t llist, y; root = root->left; llist = *iolist; while (root != NULL) SPLAY_RIGHT_STEP(root, y, llist, true); *iolist = llist; } static inline void vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b) { vm_map_entry_t tmp; tmp = *b; *b = *a; *a = tmp; } /* * Walk back up the two spines, flip the pointers and set max_free. The * subtrees of the root go at the bottom of llist and rlist. */ static void vm_map_splay_merge(vm_map_t map, vm_map_entry_t root, vm_map_entry_t llist, vm_map_entry_t rlist) { vm_map_entry_t prev; vm_size_t max_free_left, max_free_right; max_free_left = vm_map_entry_max_free_left(root, llist); if (llist != &map->header) { prev = root->left; do { /* * The max_free values of the children of llist are in * llist->max_free and max_free_left. Update with the * max value. */ llist->max_free = max_free_left = MAX(llist->max_free, max_free_left); vm_map_entry_swap(&llist->right, &prev); vm_map_entry_swap(&prev, &llist); } while (llist != &map->header); root->left = prev; } max_free_right = vm_map_entry_max_free_right(root, rlist); if (rlist != &map->header) { prev = root->right; do { /* * The max_free values of the children of rlist are in * rlist->max_free and max_free_right. Update with the * max value. */ rlist->max_free = max_free_right = MAX(rlist->max_free, max_free_right); vm_map_entry_swap(&rlist->left, &prev); vm_map_entry_swap(&prev, &rlist); } while (rlist != &map->header); root->right = prev; } root->max_free = MAX(max_free_left, max_free_right); map->root = root; #ifdef DIAGNOSTIC ++map->nupdates; #endif } /* * vm_map_splay: * * The Sleator and Tarjan top-down splay algorithm with the * following variation. Max_free must be computed bottom-up, so * on the downward pass, maintain the left and right spines in * reverse order. Then, make a second pass up each side to fix * the pointers and compute max_free. The time bound is O(log n) * amortized. * * The new root is the vm_map_entry containing "addr", or else an * adjacent entry (lower if possible) if addr is not in the tree. * * The map must be locked, and leaves it so. * * Returns: the new root. */ static vm_map_entry_t vm_map_splay(vm_map_t map, vm_offset_t addr) { vm_map_entry_t llist, rlist, root; root = vm_map_splay_split(map, addr, 0, &llist, &rlist); if (root != NULL) { /* do nothing */ } else if (llist != &map->header) { /* * Recover the greatest node in the left * subtree and make it the root. */ root = llist; llist = root->right; root->right = NULL; } else if (rlist != &map->header) { /* * Recover the least node in the right * subtree and make it the root. */ root = rlist; rlist = root->left; root->left = NULL; } else { /* There is no root. */ return (NULL); } vm_map_splay_merge(map, root, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t llist, rlist, root; CTR3(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p", map, map->nentries, entry); VM_MAP_ASSERT_LOCKED(map); map->nentries++; root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root == NULL, ("vm_map_entry_link: link object already mapped")); entry->prev = llist; entry->next = rlist; llist->next = rlist->prev = entry; entry->left = entry->right = NULL; vm_map_splay_merge(map, entry, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); } enum unlink_merge_type { UNLINK_MERGE_NONE, UNLINK_MERGE_NEXT }; static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry, enum unlink_merge_type op) { vm_map_entry_t llist, rlist, root, y; VM_MAP_ASSERT_LOCKED(map); root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root != NULL, ("vm_map_entry_unlink: unlink object not mapped")); vm_map_splay_findnext(root, &rlist); switch (op) { case UNLINK_MERGE_NEXT: rlist->start = root->start; rlist->offset = root->offset; y = root->left; root = rlist; rlist = root->left; root->left = y; break; case UNLINK_MERGE_NONE: vm_map_splay_findprev(root, &llist); if (llist != &map->header) { root = llist; llist = root->right; root->right = NULL; } else if (rlist != &map->header) { root = rlist; rlist = root->left; root->left = NULL; } else root = NULL; break; } y = entry->next; y->prev = entry->prev; y->prev->next = y; if (root != NULL) vm_map_splay_merge(map, root, llist, rlist); else map->root = NULL; VM_MAP_ASSERT_CONSISTENT(map); map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_entry_resize: * * Resize a vm_map_entry, recompute the amount of free space that * follows it and propagate that value up the tree. * * The map must be locked, and leaves it so. */ static void vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount) { vm_map_entry_t llist, rlist, root; VM_MAP_ASSERT_LOCKED(map); root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root != NULL, ("%s: resize object not mapped", __func__)); vm_map_splay_findnext(root, &rlist); root->right = NULL; entry->end += grow_amount; vm_map_splay_merge(map, root, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p", __func__, map, map->nentries, entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur, lbound; boolean_t locked; /* * If the map is empty, then the map entry immediately preceding * "address" is the map's header. */ cur = map->root; if (cur == NULL) { *entry = &map->header; return (FALSE); } if (address >= cur->start && cur->end > address) { *entry = cur; return (TRUE); } if ((locked = vm_map_locked(map)) || sx_try_upgrade(&map->lock)) { /* * Splay requires a write lock on the map. However, it only * restructures the binary search tree; it does not otherwise * change the map. Thus, the map's timestamp need not change * on a temporary upgrade. */ cur = vm_map_splay(map, address); if (!locked) { VM_MAP_UNLOCK_CONSISTENT(map); sx_downgrade(&map->lock); } /* * If "address" is contained within a map entry, the new root * is that map entry. Otherwise, the new root is a map entry * immediately before or after "address". */ if (address < cur->start) { *entry = &map->header; return (FALSE); } *entry = cur; return (address < cur->end); } /* * Since the map is only locked for read access, perform a * standard binary search tree lookup for "address". */ lbound = &map->header; do { if (address < cur->start) { cur = cur->left; } else if (cur->end <= address) { lbound = cur; cur = cur->right; } else { *entry = cur; return (TRUE); } } while (cur != NULL); *entry = lbound; return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; struct ucred *cred; vm_eflags_t protoeflags; vm_inherit_t inheritance; VM_MAP_ASSERT_LOCKED(map); KASSERT(object != kernel_object || (cow & MAP_COPY_ON_WRITE) == 0, ("vm_map_insert: kernel object and COW")); KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0, ("vm_map_insert: paradoxical MAP_NOFAULT request")); KASSERT((prot & ~max) == 0, ("prot %#x is not subset of max_prot %#x", prot, max)); /* * Check that the start and end points are not bogus. */ if (start < vm_map_min(map) || end > vm_map_max(map) || start >= end) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &prev_entry)) return (KERN_NO_SPACE); /* * Assert that the next entry doesn't overlap the end point. */ if (vm_map_entry_succ(prev_entry)->start < end) return (KERN_NO_SPACE); if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL || max != VM_PROT_NONE)) return (KERN_INVALID_ARGUMENT); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) protoeflags |= MAP_ENTRY_NOFAULT; if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (cow & MAP_STACK_GROWS_DOWN) protoeflags |= MAP_ENTRY_GROWS_DOWN; if (cow & MAP_STACK_GROWS_UP) protoeflags |= MAP_ENTRY_GROWS_UP; if (cow & MAP_WRITECOUNT) protoeflags |= MAP_ENTRY_WRITECNT; if (cow & MAP_VN_EXEC) protoeflags |= MAP_ENTRY_VN_EXEC; if ((cow & MAP_CREATE_GUARD) != 0) protoeflags |= MAP_ENTRY_GUARD; if ((cow & MAP_CREATE_STACK_GAP_DN) != 0) protoeflags |= MAP_ENTRY_STACK_GAP_DN; if ((cow & MAP_CREATE_STACK_GAP_UP) != 0) protoeflags |= MAP_ENTRY_STACK_GAP_UP; if (cow & MAP_INHERIT_SHARE) inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; cred = NULL; if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) goto charged; if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) && ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) { if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start)) return (KERN_RESOURCE_SHORTAGE); KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 || object->cred == NULL, ("overcommit: vm_map_insert o %p", object)); cred = curthread->td_ucred; } charged: /* Expand the kernel pmap, if necessary. */ if (map == kernel_map && end > kernel_vm_end) pmap_growkernel(end); if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ - VM_OBJECT_WLOCK(object); - if (object->ref_count > 1 || object->shadow_count != 0) - vm_object_clear_flag(object, OBJ_ONEMAPPING); - VM_OBJECT_WUNLOCK(object); + if ((object->flags & OBJ_ANON) != 0) { + VM_OBJECT_WLOCK(object); + if (object->ref_count > 1 || object->shadow_count != 0) + vm_object_clear_flag(object, OBJ_ONEMAPPING); + VM_OBJECT_WUNLOCK(object); + } } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) == protoeflags && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP | MAP_VN_EXEC)) == 0 && prev_entry->end == start && (prev_entry->cred == cred || (prev_entry->object.vm_object != NULL && prev_entry->object.vm_object->cred == cred)) && vm_object_coalesce(prev_entry->object.vm_object, prev_entry->offset, (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end), cred != NULL && (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if (prev_entry->inheritance == inheritance && prev_entry->protection == prot && prev_entry->max_protection == max && prev_entry->wired_count == 0) { KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) == 0, ("prev_entry %p has incoherent wiring", prev_entry)); if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0) map->size += end - prev_entry->end; vm_map_entry_resize(map, prev_entry, end - prev_entry->end); vm_map_try_merge_entries(map, prev_entry, vm_map_entry_succ(prev_entry)); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); if (cred != NULL && object != NULL && object->cred != NULL && !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { /* Object already accounts for this uid. */ cred = NULL; } } if (cred != NULL) crhold(cred); /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->cred = NULL; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->inheritance = inheritance; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; new_entry->wiring_thread = NULL; new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT; new_entry->next_read = start; KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry), ("overcommit: vm_map_insert leaks vm_map %p", new_entry)); new_entry->cred = cred; /* * Insert the new entry into the list */ vm_map_entry_link(map, new_entry); if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0) map->size += new_entry->end - new_entry->start; /* * Try to coalesce the new entry with both the previous and next * entries in the list. Previously, we only attempted to coalesce * with the previous entry when object is NULL. Here, we handle the * other cases, which are less common. */ vm_map_try_merge_entries(map, prev_entry, new_entry); vm_map_try_merge_entries(map, new_entry, vm_map_entry_succ(new_entry)); if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) { vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * vm_map_findspace: * * Find the first fit (lowest VM address) for "length" free bytes * beginning at address >= start in the given map. * * In a vm_map_entry, "max_free" is the maximum amount of * contiguous free space between an entry in its subtree and a * neighbor of that entry. This allows finding a free region in * one path down the tree, so O(log n) amortized with splay * trees. * * The map must be locked, and leaves it so. * * Returns: starting address if sufficient space, * vm_map_max(map)-length+1 if insufficient space. */ vm_offset_t vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length) { vm_map_entry_t llist, rlist, root, y; vm_size_t left_length; vm_offset_t gap_end; /* * Request must fit within min/max VM address and must avoid * address wrap. */ start = MAX(start, vm_map_min(map)); if (start >= vm_map_max(map) || length > vm_map_max(map) - start) return (vm_map_max(map) - length + 1); /* Empty tree means wide open address space. */ if (map->root == NULL) return (start); /* * After splay_split, if start is within an entry, push it to the start * of the following gap. If rlist is at the end of the gap containing * start, save the end of that gap in gap_end to see if the gap is big * enough; otherwise set gap_end to start skip gap-checking and move * directly to a search of the right subtree. */ root = vm_map_splay_split(map, start, length, &llist, &rlist); gap_end = rlist->start; if (root != NULL) { start = root->end; if (root->right != NULL) gap_end = start; } else if (rlist != &map->header) { root = rlist; rlist = root->left; root->left = NULL; } else { root = llist; llist = root->right; root->right = NULL; } vm_map_splay_merge(map, root, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); if (length <= gap_end - start) return (start); /* With max_free, can immediately tell if no solution. */ if (root->right == NULL || length > root->right->max_free) return (vm_map_max(map) - length + 1); /* * Splay for the least large-enough gap in the right subtree. */ llist = rlist = &map->header; for (left_length = 0;; left_length = vm_map_entry_max_free_left(root, llist)) { if (length <= left_length) SPLAY_LEFT_STEP(root, y, rlist, length <= vm_map_entry_max_free_left(y, llist)); else SPLAY_RIGHT_STEP(root, y, llist, length > vm_map_entry_max_free_left(y, root)); if (root == NULL) break; } root = llist; llist = root->right; root->right = NULL; if (rlist != &map->header) { y = rlist; rlist = y->left; y->left = NULL; vm_map_splay_merge(map, y, &map->header, rlist); y->max_free = MAX( vm_map_entry_max_free_left(y, root), vm_map_entry_max_free_right(y, &map->header)); root->right = y; } vm_map_splay_merge(map, root, llist, &map->header); VM_MAP_ASSERT_CONSISTENT(map); return (root->end); } int vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_size_t length, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t end; int result; end = start + length; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_fixed: non-NULL backing object for stack")); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if ((cow & MAP_CHECK_EXCL) == 0) vm_map_delete(map, start, end); if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); } else { result = vm_map_insert(map, object, offset, start, end, prot, max, cow); } vm_map_unlock(map); return (result); } static const int aslr_pages_rnd_64[2] = {0x1000, 0x10}; static const int aslr_pages_rnd_32[2] = {0x100, 0x4}; static int cluster_anon = 1; SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW, &cluster_anon, 0, "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always"); static bool clustering_anon_allowed(vm_offset_t addr) { switch (cluster_anon) { case 0: return (false); case 1: return (addr == 0); case 2: default: return (true); } } static long aslr_restarts; SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD, &aslr_restarts, 0, "Number of aslr failures"); #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) /* * Searches for the specified amount of free space in the given map with the * specified alignment. Performs an address-ordered, first-fit search from * the given address "*addr", with an optional upper bound "max_addr". If the * parameter "alignment" is zero, then the alignment is computed from the * given (object, offset) pair so as to enable the greatest possible use of * superpage mappings. Returns KERN_SUCCESS and the address of the free space * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE. * * The map must be locked. Initially, there must be at least "length" bytes * of free space at the given address. */ static int vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr, vm_offset_t alignment) { vm_offset_t aligned_addr, free_addr; VM_MAP_ASSERT_LOCKED(map); free_addr = *addr; KASSERT(free_addr == vm_map_findspace(map, free_addr, length), ("caller failed to provide space %#jx at address %p", (uintmax_t)length, (void *)free_addr)); for (;;) { /* * At the start of every iteration, the free space at address * "*addr" is at least "length" bytes. */ if (alignment == 0) pmap_align_superpage(object, offset, addr, length); else if ((*addr & (alignment - 1)) != 0) { *addr &= ~(alignment - 1); *addr += alignment; } aligned_addr = *addr; if (aligned_addr == free_addr) { /* * Alignment did not change "*addr", so "*addr" must * still provide sufficient free space. */ return (KERN_SUCCESS); } /* * Test for address wrap on "*addr". A wrapped "*addr" could * be a valid address, in which case vm_map_findspace() cannot * be relied upon to fail. */ if (aligned_addr < free_addr) return (KERN_NO_SPACE); *addr = vm_map_findspace(map, aligned_addr, length); if (*addr + length > vm_map_max(map) || (max_addr != 0 && *addr + length > max_addr)) return (KERN_NO_SPACE); free_addr = *addr; if (free_addr == aligned_addr) { /* * If a successful call to vm_map_findspace() did not * change "*addr", then "*addr" must still be aligned * and provide sufficient free space. */ return (KERN_SUCCESS); } } } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t alignment, curr_min_addr, min_addr; int gap, pidx, rv, try; bool cluster, en_aslr, update_anon; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_find: non-NULL backing object for stack")); MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)); if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL || (object->flags & OBJ_COLORED) == 0)) find_space = VMFS_ANY_SPACE; if (find_space >> 8 != 0) { KASSERT((find_space & 0xff) == 0, ("bad VMFS flags")); alignment = (vm_offset_t)1 << (find_space >> 8); } else alignment = 0; en_aslr = (map->flags & MAP_ASLR) != 0; update_anon = cluster = clustering_anon_allowed(*addr) && (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 && find_space != VMFS_NO_SPACE && object == NULL && (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP | MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE; curr_min_addr = min_addr = *addr; if (en_aslr && min_addr == 0 && !cluster && find_space != VMFS_NO_SPACE && (map->flags & MAP_ASLR_IGNSTART) != 0) curr_min_addr = min_addr = vm_map_min(map); try = 0; vm_map_lock(map); if (cluster) { curr_min_addr = map->anon_loc; if (curr_min_addr == 0) cluster = false; } if (find_space != VMFS_NO_SPACE) { KASSERT(find_space == VMFS_ANY_SPACE || find_space == VMFS_OPTIMAL_SPACE || find_space == VMFS_SUPER_SPACE || alignment != 0, ("unexpected VMFS flag")); again: /* * When creating an anonymous mapping, try clustering * with an existing anonymous mapping first. * * We make up to two attempts to find address space * for a given find_space value. The first attempt may * apply randomization or may cluster with an existing * anonymous mapping. If this first attempt fails, * perform a first-fit search of the available address * space. * * If all tries failed, and find_space is * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE. * Again enable clustering and randomization. */ try++; MPASS(try <= 2); if (try == 2) { /* * Second try: we failed either to find a * suitable region for randomizing the * allocation, or to cluster with an existing * mapping. Retry with free run. */ curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ? vm_map_min(map) : min_addr; atomic_add_long(&aslr_restarts, 1); } if (try == 1 && en_aslr && !cluster) { /* * Find space for allocation, including * gap needed for later randomization. */ pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 && (find_space == VMFS_SUPER_SPACE || find_space == VMFS_OPTIMAL_SPACE) ? 1 : 0; gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR && (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ? aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx]; *addr = vm_map_findspace(map, curr_min_addr, length + gap * pagesizes[pidx]); if (*addr + length + gap * pagesizes[pidx] > vm_map_max(map)) goto again; /* And randomize the start address. */ *addr += (arc4random() % gap) * pagesizes[pidx]; if (max_addr != 0 && *addr + length > max_addr) goto again; } else { *addr = vm_map_findspace(map, curr_min_addr, length); if (*addr + length > vm_map_max(map) || (max_addr != 0 && *addr + length > max_addr)) { if (cluster) { cluster = false; MPASS(try == 1); goto again; } rv = KERN_NO_SPACE; goto done; } } if (find_space != VMFS_ANY_SPACE && (rv = vm_map_alignspace(map, object, offset, addr, length, max_addr, alignment)) != KERN_SUCCESS) { if (find_space == VMFS_OPTIMAL_SPACE) { find_space = VMFS_ANY_SPACE; curr_min_addr = min_addr; cluster = update_anon; try = 0; goto again; } goto done; } } else if ((cow & MAP_REMAP) != 0) { if (*addr < vm_map_min(map) || *addr + length > vm_map_max(map) || *addr + length <= length) { rv = KERN_INVALID_ADDRESS; goto done; } vm_map_delete(map, *addr, *addr + length); } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot, max, cow); } else { rv = vm_map_insert(map, object, offset, *addr, *addr + length, prot, max, cow); } if (rv == KERN_SUCCESS && update_anon) map->anon_loc = *addr + length; done: vm_map_unlock(map); return (rv); } /* * vm_map_find_min() is a variant of vm_map_find() that takes an * additional parameter (min_addr) and treats the given address * (*addr) differently. Specifically, it treats *addr as a hint * and not as the minimum address where the mapping is created. * * This function works in two phases. First, it tries to * allocate above the hint. If that fails and the hint is * greater than min_addr, it performs a second pass, replacing * the hint with min_addr as the minimum address for the * allocation. */ int vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t hint; int rv; hint = *addr; for (;;) { rv = vm_map_find(map, object, offset, addr, length, max_addr, find_space, prot, max, cow); if (rv == KERN_SUCCESS || min_addr >= hint) return (rv); *addr = hint = min_addr; } } /* * A map entry with any of the following flags set must not be merged with * another entry. */ #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \ MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC) static bool vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry) { KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 || (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0, ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable", prev, entry)); return (prev->end == entry->start && prev->object.vm_object == entry->object.vm_object && (prev->object.vm_object == NULL || prev->offset + (prev->end - prev->start) == entry->offset) && prev->eflags == entry->eflags && prev->protection == entry->protection && prev->max_protection == entry->max_protection && prev->inheritance == entry->inheritance && prev->wired_count == entry->wired_count && prev->cred == entry->cred); } static void vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry) { /* * If the backing object is a vnode object, vm_object_deallocate() * calls vrele(). However, vrele() does not lock the vnode because * the vnode has additional references. Thus, the map lock can be * kept without causing a lock-order reversal with the vnode lock. * * Since we count the number of virtual page mappings in * object->un_pager.vnp.writemappings, the writemappings value * should not be adjusted when the entry is disposed of. */ if (entry->object.vm_object != NULL) vm_object_deallocate(entry->object.vm_object); if (entry->cred != NULL) crfree(entry->cred); vm_map_entry_dispose(map, entry); } /* * vm_map_try_merge_entries: * * Compare the given map entry to its predecessor, and merge its precessor * into it if possible. The entry remains valid, and may be extended. * The predecessor may be deleted. * * The map must be locked. */ void vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 && vm_map_mergeable_neighbors(prev, entry)) { vm_map_entry_unlink(map, prev, UNLINK_MERGE_NEXT); vm_map_merged_neighbor_dispose(map, prev); } } /* * vm_map_entry_back: * * Allocate an object to back a map entry. */ static inline void vm_map_entry_back(vm_map_entry_t entry) { vm_object_t object; KASSERT(entry->object.vm_object == NULL, ("map entry %p has backing object", entry)); KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("map entry %p is a submap", entry)); - object = vm_object_allocate(OBJT_DEFAULT, - atop(entry->end - entry->start)); + object = vm_object_allocate_anon(atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; if (entry->cred != NULL) { object->cred = entry->cred; object->charge = entry->end - entry->start; entry->cred = NULL; } } /* * vm_map_entry_charge_object * * If there is no object backing this entry, create one. Otherwise, if * the entry has cred, give it to the backing object. */ static inline void vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("map entry %p is a submap", entry)); if (entry->object.vm_object == NULL && !map->system_map && (entry->eflags & MAP_ENTRY_GUARD) == 0) vm_map_entry_back(entry); else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: %s: both cred e %p", __func__, entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->end > start && entry->start < start, ("_vm_map_clip_start: invalid clip of entry %p", entry)); /* * Create a backing object now, if none exists, so that more individual * objects won't be created after the map entry is split. */ vm_map_entry_charge_object(map, entry); /* Clone the entry. */ new_entry = vm_map_entry_create(map); *new_entry = *entry; /* * Split off the front portion. Insert the new entry BEFORE this one, * so that this entry has the specified starting address. */ new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); vm_map_entry_set_vnode_text(new_entry, true); /* * The object->un_pager.vnp.writemappings for the * object of MAP_ENTRY_WRITECNT type entry shall be * kept as is here. The virtual pages are * re-distributed among the clipped entries, so the sum is * left the same. */ } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if ((endaddr) < (entry->end)) \ _vm_map_clip_end((map), (entry), (endaddr)); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->start < end && entry->end > end, ("_vm_map_clip_end: invalid clip of entry %p", entry)); /* * Create a backing object now, if none exists, so that more individual * objects won't be created after the map entry is split. */ vm_map_entry_charge_object(map, entry); /* Clone the entry. */ new_entry = vm_map_entry_create(map); *new_entry = *entry; /* * Split off the back portion. Insert the new entry AFTER this one, * so that this entry has the specified ending address. */ new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); vm_map_entry_set_vnode_text(new_entry, true); } } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result; result = KERN_INVALID_ARGUMENT; vm_map_lock(submap); submap->flags |= MAP_IS_SUB_MAP; vm_map_unlock(submap); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = vm_map_entry_succ(entry); vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); if (result != KERN_SUCCESS) { vm_map_lock(submap); submap->flags &= ~MAP_IS_SUB_MAP; vm_map_unlock(submap); } return (result); } /* * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload the specified map's pmap with mappings to the specified * object's memory-resident pages. No further physical pages are * allocated, and no further virtual pages are retrieved from secondary * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a * limited number of page mappings are created at the low-end of the * specified address range. (For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within * the specified address range are mapped. */ static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t start; vm_page_t p, p_start; vm_pindex_t mask, psize, threshold, tmpidx; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; VM_OBJECT_RLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { VM_OBJECT_RUNLOCK(object); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_LOCK_DOWNGRADE(object); } psize = atop(size); if (psize + pindex > object->size) { if (object->size < pindex) { VM_OBJECT_RUNLOCK(object); return; } psize = object->size - pindex; } start = 0; p_start = NULL; threshold = MAX_INIT_PT; p = vm_page_find_least(object, pindex); /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if (((flags & MAP_PREFAULT_MADVISE) != 0 && vm_page_count_severe()) || ((flags & MAP_PREFAULT_PARTIAL) != 0 && tmpidx >= threshold)) { psize = tmpidx; break; } if (vm_page_all_valid(p)) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; } /* Jump ahead if a superpage mapping is possible. */ if (p->psind > 0 && ((addr + ptoa(tmpidx)) & (pagesizes[p->psind] - 1)) == 0) { mask = atop(pagesizes[p->psind]) - 1; if (tmpidx + mask < psize && vm_page_ps_test(p, PS_ALL_VALID, NULL)) { p += mask; threshold += mask; } } } else if (p_start != NULL) { pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } if (p_start != NULL) pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); VM_OBJECT_RUNLOCK(object); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current, entry, in_tran; vm_object_t obj; struct ucred *cred; vm_prot_t old_prot; int rv; if (start == end) return (KERN_SUCCESS); again: in_tran = NULL; vm_map_lock(map); /* * Ensure that we are not concurrently wiring pages. vm_map_wire() may * need to fault pages into the map and will drop the map lock while * doing so, and the VM object may end up in an inconsistent state if we * update the protection on the map entry in between faults. */ vm_map_wait_busy(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) entry = vm_map_entry_succ(entry); /* * Make a first pass to check for protection violations. */ for (current = entry; current->start < end; current = vm_map_entry_succ(current)) { if ((current->eflags & MAP_ENTRY_GUARD) != 0) continue; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0) in_tran = current; } /* * Postpone the operation until all in-transition map entries have * stabilized. An in-transition entry might already have its pages * wired and wired_count incremented, but not yet have its * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call * vm_fault_copy_entry() in the final loop below. */ if (in_tran != NULL) { in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(map, 0); goto again; } /* * Before changing the protections, try to reserve swap space for any * private (i.e., copy-on-write) mappings that are transitioning from * read-only to read/write access. If a reservation fails, break out * of this loop early and let the next loop simplify the entries, since * some may now be mergeable. */ rv = KERN_SUCCESS; vm_map_clip_start(map, entry, start); for (current = entry; current->start < end; current = vm_map_entry_succ(current)) { vm_map_clip_end(map, current, end); if (set_max || ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 || ENTRY_CHARGED(current) || (current->eflags & MAP_ENTRY_GUARD) != 0) { continue; } cred = curthread->td_ucred; obj = current->object.vm_object; if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) { if (!swap_reserve(current->end - current->start)) { rv = KERN_RESOURCE_SHORTAGE; end = current->end; break; } crhold(cred); current->cred = cred; continue; } VM_OBJECT_WLOCK(obj); if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { VM_OBJECT_WUNLOCK(obj); continue; } /* * Charge for the whole object allocation now, since * we cannot distinguish between non-charged and * charged clipped mapping of the same object later. */ KASSERT(obj->charge == 0, ("vm_map_protect: object %p overcharged (entry %p)", obj, current)); if (!swap_reserve(ptoa(obj->size))) { VM_OBJECT_WUNLOCK(obj); rv = KERN_RESOURCE_SHORTAGE; end = current->end; break; } crhold(cred); obj->cred = cred; obj->charge = ptoa(obj->size); VM_OBJECT_WUNLOCK(obj); } /* * If enough swap space was available, go back and fix up protections. * Otherwise, just simplify entries, since some may have been modified. * [Note that clipping is not necessary the second time.] */ for (current = entry; current->start < end; vm_map_try_merge_entries(map, vm_map_entry_pred(current), current), current = vm_map_entry_succ(current)) { if (rv != KERN_SUCCESS || (current->eflags & MAP_ENTRY_GUARD) != 0) continue; old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * For user wired map entries, the normal lazy evaluation of * write access upgrades through soft page faults is * undesirable. Instead, immediately copy any pages that are * copy-on-write and enable write access in the physical map. */ if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 && (current->protection & VM_PROT_WRITE) != 0 && (old_prot & VM_PROT_WRITE) == 0) vm_fault_copy_entry(map, map, current, current, NULL); /* * When restricting access, update the physical map. Worry * about copy-on-write here. */ if ((old_prot & ~current->protection) != 0) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK } } vm_map_try_merge_entries(map, vm_map_entry_pred(current), current); vm_map_unlock(map); return (rv); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t current, entry; bool modify_map; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: if (start == end) return (0); modify_map = true; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: if (start == end) return (0); modify_map = false; vm_map_lock_read(map); break; default: return (EINVAL); } /* * Locate starting entry and clip if necessary. */ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = vm_map_entry_succ(entry); } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; current->start < end; current = vm_map_entry_succ(current)) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_try_merge_entries(map, vm_map_entry_pred(current), current); } vm_map_try_merge_entries(map, vm_map_entry_pred(current), current); vm_map_unlock(map); } else { vm_pindex_t pstart, pend; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ for (current = entry; current->start < end; current = vm_map_entry_succ(current)) { vm_offset_t useEnd, useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; /* * MADV_FREE would otherwise rewind time to * the creation of the shadow object. Because * we hold the VM map read-locked, neither the * entry's object nor the presence of a * backing object can change. */ if (behav == MADV_FREE && current->object.vm_object != NULL && current->object.vm_object->backing_object != NULL) continue; pstart = OFF_TO_IDX(current->offset); pend = pstart + atop(current->end - current->start); useStart = current->start; useEnd = current->end; if (current->start < start) { pstart += atop(start - current->start); useStart = start; } if (current->end > end) { pend -= atop(current->end - end); useEnd = end; } if (pstart >= pend) continue; /* * Perform the pmap_advise() before clearing * PGA_REFERENCED in vm_page_advise(). Otherwise, a * concurrent pmap operation, such as pmap_remove(), * could clear a reference in the pmap and set * PGA_REFERENCED on the page before the pmap_advise() * had completed. Consequently, the page would appear * referenced based upon an old reference that * occurred before this pmap_advise() ran. */ if (behav == MADV_DONTNEED || behav == MADV_FREE) pmap_advise(map->pmap, useStart, useEnd, behav); vm_object_madvise(current->object.vm_object, pstart, pend, behav); /* * Pre-populate paging structures in the * WILLNEED case. For wired entries, the * paging structures are already populated. */ if (behav == MADV_WILLNEED && current->wired_count == 0) { vm_map_pmap_enter(map, useStart, current->protection, current->object.vm_object, pstart, ptoa(pend - pstart), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vmspace_fork. */ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: case VM_INHERIT_ZERO: break; default: return (KERN_INVALID_ARGUMENT); } if (start == end) return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = vm_map_entry_succ(temp_entry); while (entry->start < end) { vm_map_clip_end(map, entry, end); if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || new_inheritance != VM_INHERIT_ZERO) entry->inheritance = new_inheritance; vm_map_try_merge_entries(map, vm_map_entry_pred(entry), entry); entry = vm_map_entry_succ(entry); } vm_map_try_merge_entries(map, vm_map_entry_pred(entry), entry); vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_entry_in_transition: * * Release the map lock, and sleep until the entry is no longer in * transition. Awake and acquire the map lock. If the map changed while * another held the lock, lookup a possibly-changed entry at or after the * 'start' position of the old entry. */ static vm_map_entry_t vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start, vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry) { vm_map_entry_t entry; vm_offset_t start; u_int last_timestamp; VM_MAP_ASSERT_LOCKED(map); KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("not in-tranition map entry %p", in_entry)); /* * We have not yet clipped the entry. */ start = MAX(in_start, in_entry->start); in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user wiring/unwiring? */ } vm_map_lock(map); if (last_timestamp + 1 == map->timestamp) return (in_entry); /* * Look again for the entry because the map was modified while it was * unlocked. Specifically, the entry may have been clipped, merged, or * deleted. */ if (!vm_map_lookup_entry(map, start, &entry)) { if (!holes_ok) { *io_end = start; return (NULL); } entry = vm_map_entry_succ(entry); } return (entry); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry; int rv; bool first_iteration, holes_ok, need_wakeup, user_unwire; if (start == end) return (KERN_SUCCESS); holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0; user_unwire = (flags & VM_MAP_WIRE_USER) != 0; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (holes_ok) first_entry = vm_map_entry_succ(first_entry); else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } first_iteration = true; entry = first_entry; rv = KERN_SUCCESS; while (entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ entry = vm_map_entry_in_transition(map, start, &end, holes_ok, entry); if (entry == NULL) { if (first_iteration) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } rv = KERN_INVALID_ADDRESS; break; } first_entry = first_iteration ? entry : NULL; continue; } first_iteration = false; vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; /* * Check the map for holes in the specified region. * If holes_ok, skip this check. */ if (!holes_ok && (entry->end < end && vm_map_entry_succ(entry)->start > entry->end)) { end = entry->end; rv = KERN_INVALID_ADDRESS; break; } /* * If system unwiring, require that the entry is system wired. */ if (!user_unwire && vm_map_entry_system_wired_count(entry) == 0) { end = entry->end; rv = KERN_INVALID_ARGUMENT; break; } entry = vm_map_entry_succ(entry); } need_wakeup = false; if (first_entry == NULL && !vm_map_lookup_entry(map, start, &first_entry)) { KASSERT(holes_ok, ("vm_map_unwire: lookup failed")); first_entry = vm_map_entry_succ(first_entry); } for (entry = first_entry; entry->start < end; entry = vm_map_entry_succ(entry)) { /* * If holes_ok was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for draining * MAP_ENTRY_IN_TRANSITION. Moreover, another thread * could be simultaneously wiring this new mapping * entry. Detect these cases and skip any entries * marked as in transition by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT(holes_ok, ("vm_map_unwire: !HOLESOK and new/changed entry")); continue; } if (rv == KERN_SUCCESS && (!user_unwire || (entry->eflags & MAP_ENTRY_USER_WIRED))) { if (entry->wired_count == 1) vm_map_entry_unwire(map, entry); else entry->wired_count--; if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; } KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_unwire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_unwire: alien wire %p", entry)); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = true; } vm_map_try_merge_entries(map, vm_map_entry_pred(entry), entry); } vm_map_try_merge_entries(map, vm_map_entry_pred(entry), entry); vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } static void vm_map_wire_user_count_sub(u_long npages) { atomic_subtract_long(&vm_user_wire_count, npages); } static bool vm_map_wire_user_count_add(u_long npages) { u_long wired; wired = vm_user_wire_count; do { if (npages + wired > vm_page_max_user_wired) return (false); } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired, npages + wired)); return (true); } /* * vm_map_wire_entry_failure: * * Handle a wiring failure on the given entry. * * The map should be locked. */ static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && entry->wired_count == 1, ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); KASSERT(failed_addr < entry->end, ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); /* * If any pages at the start of this entry were successfully wired, * then unwire them. */ if (failed_addr > entry->start) { pmap_unwire(map->pmap, entry->start, failed_addr); vm_object_unwire(entry->object.vm_object, entry->offset, failed_addr - entry->start, PQ_ACTIVE); } /* * Assign an out-of-range value to represent the failure to wire this * entry. */ entry->wired_count = -1; } int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { int rv; vm_map_lock(map); rv = vm_map_wire_locked(map, start, end, flags); vm_map_unlock(map); return (rv); } /* * vm_map_wire_locked: * * Implements both kernel and user wiring. Returns with the map locked, * the map lock may be dropped. */ int vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t faddr, saved_end, saved_start; u_long npages; u_int last_timestamp; int rv; bool first_iteration, holes_ok, need_wakeup, user_wire; vm_prot_t prot; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); prot = 0; if (flags & VM_MAP_WIRE_WRITE) prot |= VM_PROT_WRITE; holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0; user_wire = (flags & VM_MAP_WIRE_USER) != 0; VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (holes_ok) first_entry = vm_map_entry_succ(first_entry); else return (KERN_INVALID_ADDRESS); } first_iteration = true; entry = first_entry; while (entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ entry = vm_map_entry_in_transition(map, start, &end, holes_ok, entry); if (entry == NULL) { if (first_iteration) return (KERN_INVALID_ADDRESS); rv = KERN_INVALID_ADDRESS; goto done; } first_entry = first_iteration ? entry : NULL; continue; } first_iteration = false; vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || (entry->protection & prot) != prot) { entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; if (!holes_ok) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } } else if (entry->wired_count == 0) { entry->wired_count++; npages = atop(entry->end - entry->start); if (user_wire && !vm_map_wire_user_count_add(npages)) { vm_map_wire_entry_failure(map, entry, entry->start); end = entry->end; rv = KERN_RESOURCE_SHORTAGE; goto done; } /* * Release the map lock, relying on the in-transition * mark. Mark the map busy for fork. */ saved_start = entry->start; saved_end = entry->end; last_timestamp = map->timestamp; vm_map_busy(map); vm_map_unlock(map); faddr = saved_start; do { /* * Simulate a fault to get the page and enter * it into the physical map. */ if ((rv = vm_fault(map, faddr, VM_PROT_NONE, VM_FAULT_WIRE, NULL)) != KERN_SUCCESS) break; } while ((faddr += PAGE_SIZE) < saved_end); vm_map_lock(map); vm_map_unbusy(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) KASSERT(false, ("vm_map_wire: lookup failed")); if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; while (entry->end < saved_end) { /* * In case of failure, handle entries * that were not fully wired here; * fully wired entries are handled * later. */ if (rv != KERN_SUCCESS && faddr < entry->end) vm_map_wire_entry_failure(map, entry, faddr); entry = vm_map_entry_succ(entry); } } if (rv != KERN_SUCCESS) { vm_map_wire_entry_failure(map, entry, faddr); if (user_wire) vm_map_wire_user_count_sub(npages); end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If holes_ok was specified, skip this check. */ if (!holes_ok && entry->end < end && vm_map_entry_succ(entry)->start > entry->end) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } entry = vm_map_entry_succ(entry); } rv = KERN_SUCCESS; done: need_wakeup = false; if (first_entry == NULL && !vm_map_lookup_entry(map, start, &first_entry)) { KASSERT(holes_ok, ("vm_map_wire: lookup failed")); first_entry = vm_map_entry_succ(first_entry); } for (entry = first_entry; entry->start < end; entry = vm_map_entry_succ(entry)) { /* * If holes_ok was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for faulting in the * pages or draining MAP_ENTRY_IN_TRANSITION. * Moreover, another thread could be simultaneously * wiring this new mapping entry. Detect these cases * and skip any entries marked as in transition not by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT(holes_ok, ("vm_map_wire: !HOLESOK and new/changed entry")); continue; } if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) { /* do nothing */ } else if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { /* * Undo the wiring. Wiring succeeded on this entry * but failed on a later entry. */ if (entry->wired_count == 1) { vm_map_entry_unwire(map, entry); if (user_wire) vm_map_wire_user_count_sub( atop(entry->end - entry->start)); } else entry->wired_count--; } KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_wire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_wire: alien wire %p", entry)); entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WIRE_SKIPPED); entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = true; } vm_map_try_merge_entries(map, vm_map_entry_pred(entry), entry); } vm_map_try_merge_entries(map, vm_map_entry_pred(entry), entry); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; unsigned int last_timestamp; boolean_t failed; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = entry->start; end = entry->end; } /* * Make a first pass to check for user-wired memory and holes. */ for (current = entry; current->start < end; current = vm_map_entry_succ(current)) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && current->end != vm_map_entry_succ(current)->start) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(map->pmap, start, end); failed = FALSE; /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end;) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } vm_object_reference(object); last_timestamp = map->timestamp; vm_map_unlock_read(map); if (!vm_object_sync(object, offset, size, syncio, invalidate)) failed = TRUE; start += size; vm_object_deallocate(object); vm_map_lock_read(map); if (last_timestamp == map->timestamp || !vm_map_lookup_entry(map, start, ¤t)) current = vm_map_entry_succ(current); } vm_map_unlock_read(map); return (failed ? KERN_FAILURE : KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { vm_size_t size; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->wired_count > 0, ("vm_map_entry_unwire: entry %p isn't wired", entry)); size = entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) vm_map_wire_user_count_sub(atop(size)); pmap_unwire(map->pmap, entry->start, entry->end); vm_object_unwire(entry->object.vm_object, entry->offset, size, PQ_ACTIVE); entry->wired_count = 0; } static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) vm_object_deallocate(entry->object.vm_object); uma_zfree(system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, count, size1; vm_size_t size; vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE); object = entry->object.vm_object; if ((entry->eflags & MAP_ENTRY_GUARD) != 0) { MPASS(entry->cred == NULL); MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0); MPASS(object == NULL); vm_map_entry_deallocate(entry, map->system_map); return; } size = entry->end - entry->start; map->size -= size; if (entry->cred != NULL) { swap_release_by_cred(size, entry->cred); crfree(entry->cred); } - if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && - (object != NULL)) { + if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) { + entry->object.vm_object = NULL; + } else if ((object->flags & OBJ_ANON) != 0 || + object == kernel_object) { KASSERT(entry->cred == NULL || object->cred == NULL || (entry->eflags & MAP_ENTRY_NEEDS_COPY), ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); count = atop(size); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_WLOCK(object); - if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT | - OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || + if (object->ref_count != 1 && + ((object->flags & OBJ_ONEMAPPING) != 0 || object == kernel_object)) { vm_object_collapse(object); /* * The option OBJPR_NOTMAPPED can be passed here * because vm_map_delete() already performed * pmap_remove() on the only mapping to this range * of pages. */ vm_object_page_remove(object, offidxstart, offidxend, OBJPR_NOTMAPPED); if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && offidxstart < object->size) { size1 = object->size; object->size = offidxstart; if (object->cred != NULL) { size1 -= object->size; KASSERT(object->charge >= ptoa(size1), ("object %p charge < 0", object)); swap_release_by_cred(ptoa(size1), object->cred); object->charge -= ptoa(size1); } } } VM_OBJECT_WUNLOCK(object); - } else - entry->object.vm_object = NULL; + } if (map->system_map) vm_map_entry_deallocate(entry, TRUE); else { entry->defer_next = curthread->td_map_def_user; curthread->td_map_def_user = entry; } } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. */ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry; vm_map_entry_t first_entry; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = vm_map_entry_succ(first_entry); else { entry = first_entry; vm_map_clip_start(map, entry, start); } /* * Step through all entries in this region */ while (entry->start < end) { vm_map_entry_t next; /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on * user maps. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && vm_map_entry_system_wired_count(entry) != 0)) { unsigned int last_timestamp; vm_offset_t saved_start; vm_map_entry_t tmp_entry; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, 0); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) entry = vm_map_entry_succ(tmp_entry); else { entry = tmp_entry; vm_map_clip_start(map, entry, saved_start); } } continue; } vm_map_clip_end(map, entry, end); next = vm_map_entry_succ(entry); /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); /* * Remove mappings for the pages, but only if the * mappings could exist. For instance, it does not * make sense to call pmap_remove() for guard entries. */ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || entry->object.vm_object != NULL) pmap_remove(map->pmap, entry->start, entry->end); if (entry->end == map->anon_loc) map->anon_loc = entry->start; /* * Delete the entry only after removing all pmap * entries pointing to its pages. (Otherwise, its * page frames may be reallocated, and any modify bits * will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = vm_map_entry_succ(entry); } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_ooffset_t *fork_charge) { vm_object_t src_object; vm_map_entry_t fake_entry; vm_offset_t size; struct ucred *cred; int charged; VM_MAP_ASSERT_LOCKED(dst_map); if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0 || (src_entry->protection & VM_PROT_WRITE) == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && (src_entry->protection & VM_PROT_WRITE) != 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ size = src_entry->end - src_entry->start; if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_WLOCK(src_object); charged = ENTRY_CHARGED(src_entry); if (src_object->handle == NULL && - (src_object->type == OBJT_DEFAULT || - src_object->type == OBJT_SWAP)) { + (src_object->flags & OBJ_ANON) != 0) { vm_object_collapse(src_object); - if ((src_object->flags & (OBJ_NOSPLIT | - OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { + if ((src_object->flags & OBJ_ONEMAPPING) != 0) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); if (src_entry->cred != NULL && !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { KASSERT(src_object->cred == NULL, ("OVERCOMMIT: vm_map_copy_entry: cred %p", src_object)); src_object->cred = src_entry->cred; src_object->charge = size; } VM_OBJECT_WUNLOCK(src_object); dst_entry->object.vm_object = src_object; if (charged) { cred = curthread->td_ucred; crhold(cred); dst_entry->cred = cred; *fork_charge += size; if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { crhold(cred); src_entry->cred = cred; *fork_charge += size; } } src_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->offset = src_entry->offset; if (src_entry->eflags & MAP_ENTRY_WRITECNT) { /* * MAP_ENTRY_WRITECNT cannot * indicate write reference from * src_entry, since the entry is * marked as needs copy. Allocate a * fake entry that is used to * decrement object->un_pager writecount * at the appropriate time. Attach * fake_entry to the deferred list. */ fake_entry = vm_map_entry_create(dst_map); fake_entry->eflags = MAP_ENTRY_WRITECNT; src_entry->eflags &= ~MAP_ENTRY_WRITECNT; vm_object_reference(src_object); fake_entry->object.vm_object = src_object; fake_entry->start = src_entry->start; fake_entry->end = src_entry->end; fake_entry->defer_next = curthread->td_map_def_user; curthread->td_map_def_user = fake_entry; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; if (src_entry->cred != NULL) { dst_entry->cred = curthread->td_ucred; crhold(dst_entry->cred); *fork_charge += size; } } } else { /* * We don't want to make writeable wired pages copy-on-write. * Immediately copy these pages into the new map by simulating * page faults. The new pages are pageable. */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, fork_charge); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) */ static void vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_map_entry_t entry) { vm_size_t entrysize; vm_offset_t newend; if ((entry->eflags & MAP_ENTRY_GUARD) != 0) return; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { vm2->vm_ssize += btoc(entrysize); } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); vm2->vm_dsize += btoc(newend - entry->start); } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); vm2->vm_tsize += btoc(newend - entry->start); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * XXX It might be worth coalescing the entries added to the new vmspace. * * The source map must not be locked. */ struct vmspace * vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) { struct vmspace *vm2; vm_map_t new_map, old_map; vm_map_entry_t new_entry, old_entry; vm_object_t object; int error, locked; vm_inherit_t inh; old_map = &vm1->vm_map; /* Copy immutable fields of vm1 to vm2. */ vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map), pmap_pinit); if (vm2 == NULL) return (NULL); vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; vm_map_lock(old_map); if (old_map->busy) vm_map_wait_busy(old_map); new_map = &vm2->vm_map; locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ KASSERT(locked, ("vmspace_fork: lock failed")); error = pmap_vmspace_copy(new_map->pmap, old_map->pmap); if (error != 0) { sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); vmspace_free(vm2); return (NULL); } new_map->anon_loc = old_map->anon_loc; old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); inh = old_entry->inheritance; if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 && inh != VM_INHERIT_NONE) inh = VM_INHERIT_COPY; switch (inh) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. */ object = old_entry->object.vm_object; if (object == NULL) { vm_map_entry_back(old_entry); object = old_entry->object.vm_object; } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, old_entry->end - old_entry->start); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; /* Transfer the second reference too. */ vm_object_reference( old_entry->object.vm_object); /* * As in vm_map_merged_neighbor_dispose(), * the vnode lock will not be acquired in * this call to vm_object_deallocate(). */ vm_object_deallocate(object); object = old_entry->object.vm_object; } VM_OBJECT_WLOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); if (old_entry->cred != NULL) { KASSERT(object->cred == NULL, ("vmspace_fork both cred")); object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } /* * Assert the correct state of the vnode * v_writecount while the object is locked, to * not relock it later for the assertion * correctness. */ if (old_entry->eflags & MAP_ENTRY_WRITECNT && object->type == OBJT_VNODE) { KASSERT(((struct vnode *)object->handle)-> v_writecount > 0, ("vmspace_fork: v_writecount %p", object)); KASSERT(object->un_pager.vnp.writemappings > 0, ("vmspace_fork: vnp.writecount %p", object)); } VM_OBJECT_WUNLOCK(object); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; if (new_entry->eflags & MAP_ENTRY_WRITECNT) { vm_pager_update_writecount(object, new_entry->start, new_entry->end); } vm_map_entry_set_vnode_text(new_entry, true); /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; /* * Copied entry is COW over the old object. */ new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->cred = NULL; vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry, fork_charge); vm_map_entry_set_vnode_text(new_entry, true); break; case VM_INHERIT_ZERO: /* * Create a new anonymous mapping entry modelled from * the old one. */ new_entry = vm_map_entry_create(new_map); memset(new_entry, 0, sizeof(*new_entry)); new_entry->start = old_entry->start; new_entry->end = old_entry->end; new_entry->eflags = old_entry->eflags & ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC); new_entry->protection = old_entry->protection; new_entry->max_protection = old_entry->max_protection; new_entry->inheritance = VM_INHERIT_ZERO; vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); new_entry->cred = curthread->td_ucred; crhold(new_entry->cred); *fork_charge += (new_entry->end - new_entry->start); break; } old_entry = vm_map_entry_succ(old_entry); } /* * Use inlined vm_map_unlock() to postpone handling the deferred * map entries, which cannot be done until both old_map and * new_map locks are released. */ sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); return (vm2); } /* * Create a process's stack for exec_new_vmspace(). This function is never * asked to wire the newly created stack. */ int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_size_t growsize, init_ssize; rlim_t vmemlim; int rv; MPASS((map->flags & MAP_WIREFUTURE) == 0); growsize = sgrowsiz; init_ssize = (max_ssize < growsize) ? max_ssize : growsize; vm_map_lock(map); vmemlim = lim_cur(curthread, RLIMIT_VMEM); /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { rv = KERN_NO_SPACE; goto out; } rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, max, cow); out: vm_map_unlock(map); return (rv); } static int stack_guard_page = 1; SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, &stack_guard_page, 0, "Specifies the number of guard pages for a stack that grows"); static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, gap_bot, gap_top, top; vm_size_t init_ssize, sgp; int orient, rv; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. */ orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP); KASSERT(orient != 0, ("No stack grow direction")); KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP), ("bi-dir stack")); if (addrbos < vm_map_min(map) || addrbos + max_ssize > vm_map_max(map) || addrbos + max_ssize <= addrbos) return (KERN_INVALID_ADDRESS); sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : (vm_size_t)stack_guard_page * PAGE_SIZE; if (sgp >= max_ssize) return (KERN_INVALID_ARGUMENT); init_ssize = growsize; if (max_ssize < init_ssize + sgp) init_ssize = max_ssize - sgp; /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) return (KERN_NO_SPACE); /* * If we can't accommodate max_ssize in the current mapping, no go. */ if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize) return (KERN_NO_SPACE); /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) { bot = addrbos + max_ssize - init_ssize; top = bot + init_ssize; gap_bot = addrbos; gap_top = bot; } else /* if (orient == MAP_STACK_GROWS_UP) */ { bot = addrbos; top = bot + init_ssize; gap_bot = top; gap_top = addrbos + max_ssize; } rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); if (rv != KERN_SUCCESS) return (rv); new_entry = vm_map_entry_succ(prev_entry); KASSERT(new_entry->end == top || new_entry->start == bot, ("Bad entry start/end for new stack entry")); KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, ("new entry lacks MAP_ENTRY_GROWS_DOWN")); KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, ("new entry lacks MAP_ENTRY_GROWS_UP")); if (gap_bot == gap_top) return (KERN_SUCCESS); rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); if (rv == KERN_SUCCESS) { /* * Gap can never successfully handle a fault, so * read-ahead logic is never used for it. Re-use * next_read of the gap entry to store * stack_guard_page for vm_map_growstack(). */ if (orient == MAP_STACK_GROWS_DOWN) vm_map_entry_pred(new_entry)->next_read = sgp; else vm_map_entry_succ(new_entry)->next_read = sgp; } else { (void)vm_map_delete(map, bot, top); } return (rv); } /* * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we * successfully grow the stack. */ static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry) { vm_map_entry_t stack_entry; struct proc *p; struct vmspace *vm; struct ucred *cred; vm_offset_t gap_end, gap_start, grow_start; vm_size_t grow_amount, guard, max_grow; rlim_t lmemlim, stacklim, vmemlim; int rv, rv1; bool gap_deleted, grow_down, is_procstack; #ifdef notyet uint64_t limit; #endif #ifdef RACCT int error; #endif p = curproc; vm = p->p_vmspace; /* * Disallow stack growth when the access is performed by a * debugger or AIO daemon. The reason is that the wrong * resource limits are applied. */ if (p != initproc && (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)) return (KERN_FAILURE); MPASS(!map->system_map); lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); stacklim = lim_cur(curthread, RLIMIT_STACK); vmemlim = lim_cur(curthread, RLIMIT_VMEM); retry: /* If addr is not in a hole for a stack grow area, no need to grow. */ if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) return (KERN_FAILURE); if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0) return (KERN_SUCCESS); if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) { stack_entry = vm_map_entry_succ(gap_entry); if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 || stack_entry->start != gap_entry->end) return (KERN_FAILURE); grow_amount = round_page(stack_entry->start - addr); grow_down = true; } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) { stack_entry = vm_map_entry_pred(gap_entry); if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 || stack_entry->end != gap_entry->start) return (KERN_FAILURE); grow_amount = round_page(addr + 1 - stack_entry->end); grow_down = false; } else { return (KERN_FAILURE); } guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : gap_entry->next_read; max_grow = gap_entry->end - gap_entry->start; if (guard > max_grow) return (KERN_NO_SPACE); max_grow -= guard; if (grow_amount > max_grow) return (KERN_NO_SPACE); /* * If this is the main process stack, see if we're over the stack * limit. */ is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr && addr < (vm_offset_t)p->p_sysent->sv_usrstack; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) return (KERN_NO_SPACE); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (is_procstack && racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) { PROC_UNLOCK(p); return (KERN_NO_SPACE); } PROC_UNLOCK(p); } #endif grow_amount = roundup(grow_amount, sgrowsiz); if (grow_amount > max_grow) grow_amount = max_grow; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = trunc_page((vm_size_t)stacklim) - ctob(vm->vm_ssize); } #ifdef notyet PROC_LOCK(p); limit = racct_get_available(p, RACCT_STACK); PROC_UNLOCK(p); if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) grow_amount = limit - ctob(vm->vm_ssize); #endif if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) { if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { PROC_UNLOCK(p); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { PROC_UNLOCK(p); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif if (vm_map_lock_upgrade(map)) { gap_entry = NULL; vm_map_lock_read(map); goto retry; } if (grow_down) { grow_start = gap_entry->end - grow_amount; if (gap_entry->start + grow_amount == gap_entry->end) { gap_start = gap_entry->start; gap_end = gap_entry->end; vm_map_entry_delete(map, gap_entry); gap_deleted = true; } else { MPASS(gap_entry->start < gap_entry->end - grow_amount); vm_map_entry_resize(map, gap_entry, -grow_amount); gap_deleted = false; } rv = vm_map_insert(map, NULL, 0, grow_start, grow_start + grow_amount, stack_entry->protection, stack_entry->max_protection, MAP_STACK_GROWS_DOWN); if (rv != KERN_SUCCESS) { if (gap_deleted) { rv1 = vm_map_insert(map, NULL, 0, gap_start, gap_end, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN); MPASS(rv1 == KERN_SUCCESS); } else vm_map_entry_resize(map, gap_entry, grow_amount); } } else { grow_start = stack_entry->end; cred = stack_entry->cred; if (cred == NULL && stack_entry->object.vm_object != NULL) cred = stack_entry->object.vm_object->cred; if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred)) rv = KERN_NO_SPACE; /* Grow the underlying object if applicable. */ else if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), grow_amount, cred != NULL)) { if (gap_entry->start + grow_amount == gap_entry->end) { vm_map_entry_delete(map, gap_entry); vm_map_entry_resize(map, stack_entry, grow_amount); } else { gap_entry->start += grow_amount; stack_entry->end += grow_amount; } map->size += grow_amount; rv = KERN_SUCCESS; } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) { rv = vm_map_wire_locked(map, grow_start, grow_start + grow_amount, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); } vm_map_lock_downgrade(map); out: #ifdef RACCT if (racct_enable && rv != KERN_SUCCESS) { PROC_LOCK(p); error = racct_set(p, RACCT_VMEM, map->size); KASSERT(error == 0, ("decreasing RACCT_VMEM failed")); if (!old_mlock) { error = racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed")); } error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize)); KASSERT(error == 0, ("decreasing RACCT_STACK failed")); PROC_UNLOCK(p); } #endif return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ int vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0, ("vmspace_exec recursed")); newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit); if (newvmspace == NULL) return (ENOMEM); newvmspace->vm_swrss = oldvmspace->vm_swrss; /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); curthread->td_pflags |= TDP_EXECVMSPC; return (0); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ int vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; vm_ooffset_t fork_charge; if (oldvmspace->vm_refcnt == 1) return (0); fork_charge = 0; newvmspace = vmspace_fork(oldvmspace, &fork_charge); if (newvmspace == NULL) return (ENOMEM); if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) { vmspace_free(newvmspace); return (ENOMEM); } PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); vmspace_free(oldvmspace); return (0); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; vm_object_t eobject; vm_size_t size; struct ucred *cred; RetryLookup: vm_map_lock_read(map); RetryLookupLocked: /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } entry = *out_entry; /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. */ prot = entry->protection; if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) { fault_typea &= ~VM_PROT_FAULT_LOOKUP; if (prot == VM_PROT_NONE && map != kernel_map && (entry->eflags & MAP_ENTRY_GUARD) != 0 && (entry->eflags & (MAP_ENTRY_STACK_GAP_DN | MAP_ENTRY_STACK_GAP_UP)) != 0 && vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS) goto RetryLookupLocked; } fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags & (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) != (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY), ("entry %p flags %x", entry, entry->eflags)); if ((fault_typea & VM_PROT_COPY) != 0 && (entry->max_protection & VM_PROT_WRITE) == 0 && (entry->eflags & MAP_ENTRY_COW) == 0) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; size = entry->end - entry->start; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if ((fault_type & VM_PROT_WRITE) != 0 || (fault_typea & VM_PROT_COPY) != 0) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; if (entry->cred == NULL) { /* * The debugger owner is charged for * the memory. */ cred = curthread->td_ucred; crhold(cred); if (!swap_reserve_by_cred(size, cred)) { crfree(cred); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } entry->cred = cred; } vm_object_shadow(&entry->object.vm_object, &entry->offset, size); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; eobject = entry->object.vm_object; if (eobject->cred != NULL) { /* * The object was not shadowed. */ swap_release_by_cred(size, entry->cred); crfree(entry->cred); entry->cred = NULL; } else if (entry->cred != NULL) { VM_OBJECT_WLOCK(eobject); eobject->cred = entry->cred; eobject->charge = size; VM_OBJECT_WUNLOCK(eobject); entry->cred = NULL; } vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; - entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, - atop(size)); + entry->object.vm_object = vm_object_allocate_anon(atop(size)); entry->offset = 0; if (entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = size; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_locked: * * Lookup the faulting address. A version of vm_map_lookup that returns * KERN_FAILURE instead of blocking on map lock or memory allocation. */ int vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) return (KERN_INVALID_ADDRESS); entry = *out_entry; /* * Fail if the entry refers to a submap. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return (KERN_FAILURE); /* * Check whether this task is allowed to have this page. */ prot = entry->protection; fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * Fail if the entry was copy-on-write for a write fault. */ if (fault_type & VM_PROT_WRITE) return (KERN_FAILURE); /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } /* * Fail if an object should be created. */ if (entry->object.vm_object == NULL && !map->system_map) return (KERN_FAILURE); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } vm_offset_t vm_map_max_KBI(const struct vm_map *map) { return (vm_map_max(map)); } vm_offset_t vm_map_min_KBI(const struct vm_map *map) { return (vm_map_min(map)); } pmap_t vm_map_pmap_KBI(vm_map_t map) { return (map->pmap); } #ifdef INVARIANTS static void _vm_map_assert_consistent(vm_map_t map, int check) { vm_map_entry_t entry, prev; vm_size_t max_left, max_right; if (enable_vmmap_check != check) return; prev = &map->header; VM_MAP_ENTRY_FOREACH(entry, map) { KASSERT(prev->end <= entry->start, ("map %p prev->end = %jx, start = %jx", map, (uintmax_t)prev->end, (uintmax_t)entry->start)); KASSERT(entry->start < entry->end, ("map %p start = %jx, end = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->end)); KASSERT(entry->end <= vm_map_entry_succ(entry)->start, ("map %p end = %jx, next->start = %jx", map, (uintmax_t)entry->end, (uintmax_t)vm_map_entry_succ(entry)->start)); KASSERT(entry->left == NULL || entry->left->start < entry->start, ("map %p left->start = %jx, start = %jx", map, (uintmax_t)entry->left->start, (uintmax_t)entry->start)); KASSERT(entry->right == NULL || entry->start < entry->right->start, ("map %p start = %jx, right->start = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->right->start)); max_left = vm_map_entry_max_free_left(entry, vm_map_entry_pred(entry)); max_right = vm_map_entry_max_free_right(entry, vm_map_entry_succ(entry)); KASSERT(entry->max_free == MAX(max_left, max_right), ("map %p max = %jx, max_left = %jx, max_right = %jx", map, (uintmax_t)entry->max_free, (uintmax_t)max_left, (uintmax_t)max_right)); prev = entry; } KASSERT(prev->end <= entry->start, ("map %p prev->end = %jx, start = %jx", map, (uintmax_t)prev->end, (uintmax_t)entry->start)); } #endif #include "opt_ddb.h" #ifdef DDB #include #include static void vm_map_print(vm_map_t map) { vm_map_entry_t entry, prev; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); db_indent += 2; prev = &map->header; VM_MAP_ENTRY_FOREACH(entry, map) { db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n", (void *)entry, (void *)entry->start, (void *)entry->end, entry->eflags); { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char) entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); if (prev == &map->header || prev->object.sub_map != entry->object.sub_map) { db_indent += 2; vm_map_print((vm_map_t)entry->object.sub_map); db_indent -= 2; } } else { if (entry->cred != NULL) db_printf(", ruid %d", entry->cred->cr_ruid); db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->object.vm_object && entry->object.vm_object->cred) db_printf(", obj ruid %d charge %jx", entry->object.vm_object->cred->cr_ruid, (uintmax_t)entry->object.vm_object->charge); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); if (prev == &map->header || prev->object.vm_object != entry->object.vm_object) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, 0, 0, (char *)0); db_indent -= 2; } } prev = entry; } db_indent -= 2; } DB_SHOW_COMMAND(map, map) { if (!have_addr) { db_printf("usage: show map \n"); return; } vm_map_print((vm_map_t)addr); } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = db_lookup_proc(addr); } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((vm_map_t)&p->p_vmspace->vm_map); } #endif /* DDB */ Index: head/sys/vm/vm_meter.c =================================================================== --- head/sys/vm/vm_meter.c (revision 354868) +++ head/sys/vm/vm_meter.c (revision 354869) @@ -1,568 +1,568 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vmmeter __read_mostly vm_cnt = { .v_swtch = EARLY_COUNTER, .v_trap = EARLY_COUNTER, .v_syscall = EARLY_COUNTER, .v_intr = EARLY_COUNTER, .v_soft = EARLY_COUNTER, .v_vm_faults = EARLY_COUNTER, .v_io_faults = EARLY_COUNTER, .v_cow_faults = EARLY_COUNTER, .v_cow_optim = EARLY_COUNTER, .v_zfod = EARLY_COUNTER, .v_ozfod = EARLY_COUNTER, .v_swapin = EARLY_COUNTER, .v_swapout = EARLY_COUNTER, .v_swappgsin = EARLY_COUNTER, .v_swappgsout = EARLY_COUNTER, .v_vnodein = EARLY_COUNTER, .v_vnodeout = EARLY_COUNTER, .v_vnodepgsin = EARLY_COUNTER, .v_vnodepgsout = EARLY_COUNTER, .v_intrans = EARLY_COUNTER, .v_reactivated = EARLY_COUNTER, .v_pdwakeups = EARLY_COUNTER, .v_pdpages = EARLY_COUNTER, .v_pdshortfalls = EARLY_COUNTER, .v_dfree = EARLY_COUNTER, .v_pfree = EARLY_COUNTER, .v_tfree = EARLY_COUNTER, .v_forks = EARLY_COUNTER, .v_vforks = EARLY_COUNTER, .v_rforks = EARLY_COUNTER, .v_kthreads = EARLY_COUNTER, .v_forkpages = EARLY_COUNTER, .v_vforkpages = EARLY_COUNTER, .v_rforkpages = EARLY_COUNTER, .v_kthreadpages = EARLY_COUNTER, .v_wire_count = EARLY_COUNTER, }; u_long __exclusive_cache_line vm_user_wire_count; static void vmcounter_startup(void) { counter_u64_t *cnt = (counter_u64_t *)&vm_cnt; COUNTER_ARRAY_ALLOC(cnt, VM_METER_NCOUNTERS, M_WAITOK); } SYSINIT(counter, SI_SUB_KMEM, SI_ORDER_FIRST, vmcounter_startup, NULL); SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold"); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages"); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock"); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive"); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point"); static int sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 u_int32_t la[4]; if (req->flags & SCTL_MASK32) { la[0] = averunnable.ldavg[0]; la[1] = averunnable.ldavg[1]; la[2] = averunnable.ldavg[2]; la[3] = averunnable.fscale; return SYSCTL_OUT(req, la, sizeof(la)); } else #endif return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg", "Machine loadaverage history"); /* * This function aims to determine if the object is mapped, * specifically, if it is referenced by a vm_map_entry. Because * objects occasionally acquire transient references that do not * represent a mapping, the method used here is inexact. However, it * has very low overhead and is good enough for the advisory * vm.vmtotal sysctl. */ static bool is_object_active(vm_object_t obj) { return (obj->ref_count > obj->shadow_count); } #if defined(COMPAT_FREEBSD11) struct vmtotal11 { int16_t t_rq; int16_t t_dw; int16_t t_pw; int16_t t_sl; int16_t t_sw; int32_t t_vm; int32_t t_avm; int32_t t_rm; int32_t t_arm; int32_t t_vmshr; int32_t t_avmshr; int32_t t_rmshr; int32_t t_armshr; int32_t t_free; }; #endif static int vmtotal(SYSCTL_HANDLER_ARGS) { struct vmtotal total; #if defined(COMPAT_FREEBSD11) struct vmtotal11 total11; #endif vm_object_t object; struct proc *p; struct thread *td; if (req->oldptr == NULL) { #if defined(COMPAT_FREEBSD11) if (curproc->p_osrel < P_OSREL_VMTOTAL64) return (SYSCTL_OUT(req, NULL, sizeof(total11))); #endif return (SYSCTL_OUT(req, NULL, sizeof(total))); } bzero(&total, sizeof(total)); /* * Calculate process statistics. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if ((p->p_flag & P_SYSTEM) != 0) continue; PROC_LOCK(p); if (p->p_state != PRS_NEW) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_INHIBITED: if (TD_IS_SWAPPED(td)) total.t_sw++; else if (TD_IS_SLEEPING(td)) { if (td->td_priority <= PZERO) total.t_dw++; else total.t_sl++; } break; case TDS_CAN_RUN: total.t_sw++; break; case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; break; default: break; } thread_unlock(td); } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* * Calculate object memory usage statistics. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { /* * Perform unsynchronized reads on the object. In * this case, the lack of synchronization should not * impair the accuracy of the reported statistics. */ if ((object->flags & OBJ_FICTITIOUS) != 0) { /* * Devices, like /dev/mem, will badly skew our totals. */ continue; } if (object->ref_count == 0) { /* * Also skip unreferenced objects, including * vnodes representing mounted file systems. */ continue; } if (object->ref_count == 1 && - (object->flags & OBJ_NOSPLIT) != 0) { + (object->flags & OBJ_ANON) == 0) { /* * Also skip otherwise unreferenced swap * objects backing tmpfs vnodes, and POSIX or * SysV shared memory. */ continue; } total.t_vm += object->size; total.t_rm += object->resident_page_count; if (is_object_active(object)) { total.t_avm += object->size; total.t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ total.t_vmshr += object->size; total.t_rmshr += object->resident_page_count; if (is_object_active(object)) { total.t_avmshr += object->size; total.t_armshr += object->resident_page_count; } } } mtx_unlock(&vm_object_list_mtx); total.t_pw = vm_wait_count(); total.t_free = vm_free_count(); #if defined(COMPAT_FREEBSD11) /* sysctl(8) allocates twice as much memory as reported by sysctl(3) */ if (curproc->p_osrel < P_OSREL_VMTOTAL64 && (req->oldlen == sizeof(total11) || req->oldlen == 2 * sizeof(total11))) { bzero(&total11, sizeof(total11)); total11.t_rq = total.t_rq; total11.t_dw = total.t_dw; total11.t_pw = total.t_pw; total11.t_sl = total.t_sl; total11.t_sw = total.t_sw; total11.t_vm = total.t_vm; /* truncate */ total11.t_avm = total.t_avm; /* truncate */ total11.t_rm = total.t_rm; /* truncate */ total11.t_arm = total.t_arm; /* truncate */ total11.t_vmshr = total.t_vmshr; /* truncate */ total11.t_avmshr = total.t_avmshr; /* truncate */ total11.t_rmshr = total.t_rmshr; /* truncate */ total11.t_armshr = total.t_armshr; /* truncate */ total11.t_free = total.t_free; /* truncate */ return (SYSCTL_OUT(req, &total11, sizeof(total11))); } #endif return (SYSCTL_OUT(req, &total, sizeof(total))); } SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, "VM meter sys stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); static int sysctl_handle_vmstat(SYSCTL_HANDLER_ARGS) { uint64_t val; #ifdef COMPAT_FREEBSD11 uint32_t val32; #endif val = counter_u64_fetch(*(counter_u64_t *)arg1); #ifdef COMPAT_FREEBSD11 if (req->oldlen == sizeof(val32)) { val32 = val; /* truncate */ return (SYSCTL_OUT(req, &val32, sizeof(val32))); } #endif return (SYSCTL_OUT(req, &val, sizeof(val))); } #define VM_STATS(parent, var, descr) \ SYSCTL_OID(parent, OID_AUTO, var, CTLTYPE_U64 | CTLFLAG_MPSAFE | \ CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) VM_STATS_SYS(v_swtch, "Context switches"); VM_STATS_SYS(v_trap, "Traps"); VM_STATS_SYS(v_syscall, "System calls"); VM_STATS_SYS(v_intr, "Device interrupts"); VM_STATS_SYS(v_soft, "Software interrupts"); VM_STATS_VM(v_vm_faults, "Address memory faults"); VM_STATS_VM(v_io_faults, "Page faults requiring I/O"); VM_STATS_VM(v_cow_faults, "Copy-on-write faults"); VM_STATS_VM(v_cow_optim, "Optimized COW faults"); VM_STATS_VM(v_zfod, "Pages zero-filled on demand"); VM_STATS_VM(v_ozfod, "Optimized zero fill pages"); VM_STATS_VM(v_swapin, "Swap pager pageins"); VM_STATS_VM(v_swapout, "Swap pager pageouts"); VM_STATS_VM(v_swappgsin, "Swap pages swapped in"); VM_STATS_VM(v_swappgsout, "Swap pages swapped out"); VM_STATS_VM(v_vnodein, "Vnode pager pageins"); VM_STATS_VM(v_vnodeout, "Vnode pager pageouts"); VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); VM_STATS_VM(v_intrans, "In transit page faults"); VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); VM_STATS_VM(v_tfree, "Total pages freed"); VM_STATS_VM(v_forks, "Number of fork() calls"); VM_STATS_VM(v_vforks, "Number of vfork() calls"); VM_STATS_VM(v_rforks, "Number of rfork() calls"); VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel"); VM_STATS_VM(v_forkpages, "VM pages affected by fork()"); VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()"); VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()"); VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel"); static int sysctl_handle_vmstat_proc(SYSCTL_HANDLER_ARGS) { u_int (*fn)(void); uint32_t val; fn = arg1; val = fn(); return (SYSCTL_OUT(req, &val, sizeof(val))); } #define VM_STATS_PROC(var, descr, fn) \ SYSCTL_OID(_vm_stats_vm, OID_AUTO, var, CTLTYPE_U32 | CTLFLAG_MPSAFE | \ CTLFLAG_RD, fn, 0, sysctl_handle_vmstat_proc, "IU", descr) #define VM_STATS_UINT(var, descr) \ SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) #define VM_STATS_ULONG(var, descr) \ SYSCTL_ULONG(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) VM_STATS_UINT(v_page_size, "Page size in bytes"); VM_STATS_UINT(v_page_count, "Total number of pages in system"); VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock"); VM_STATS_UINT(v_free_target, "Pages desired free"); VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold"); VM_STATS_PROC(v_free_count, "Free pages", vm_free_count); VM_STATS_PROC(v_wire_count, "Wired pages", vm_wire_count); VM_STATS_PROC(v_active_count, "Active pages", vm_active_count); VM_STATS_UINT(v_inactive_target, "Desired inactive pages"); VM_STATS_PROC(v_inactive_count, "Inactive pages", vm_inactive_count); VM_STATS_PROC(v_laundry_count, "Pages eligible for laundering", vm_laundry_count); VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel"); VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code"); VM_STATS_UINT(v_free_severe, "Severe page depletion point"); SYSCTL_ULONG(_vm_stats_vm, OID_AUTO, v_user_wire_count, CTLFLAG_RD, &vm_user_wire_count, 0, "User-wired virtual memory"); #ifdef COMPAT_FREEBSD11 /* * Provide compatibility sysctls for the benefit of old utilities which exit * with an error if they cannot be found. */ SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD, SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD, SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); #endif u_int vm_free_count(void) { u_int v; int i; v = 0; for (i = 0; i < vm_ndomains; i++) v += vm_dom[i].vmd_free_count; return (v); } static u_int vm_pagequeue_count(int pq) { u_int v; int i; v = 0; for (i = 0; i < vm_ndomains; i++) v += vm_dom[i].vmd_pagequeues[pq].pq_cnt; return (v); } u_int vm_active_count(void) { return (vm_pagequeue_count(PQ_ACTIVE)); } u_int vm_inactive_count(void) { return (vm_pagequeue_count(PQ_INACTIVE)); } u_int vm_laundry_count(void) { return (vm_pagequeue_count(PQ_LAUNDRY)); } static int sysctl_vm_pdpages(SYSCTL_HANDLER_ARGS) { struct vm_pagequeue *pq; uint64_t ret; int dom, i; ret = counter_u64_fetch(vm_cnt.v_pdpages); for (dom = 0; dom < vm_ndomains; dom++) for (i = 0; i < PQ_COUNT; i++) { pq = &VM_DOMAIN(dom)->vmd_pagequeues[i]; ret += pq->pq_pdpages; } return (SYSCTL_OUT(req, &ret, sizeof(ret))); } SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_vm_pdpages, "QU", "Pages analyzed by pagedaemon"); static void vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent) { struct sysctl_oid *oid; vmd->vmd_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, vmd->vmd_name, CTLFLAG_RD, NULL, ""); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO, "stats", CTLFLAG_RD, NULL, ""); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_count", CTLFLAG_RD, &vmd->vmd_free_count, 0, "Free pages"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "active", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt, 0, "Active pages"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "actpdpgs", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_pdpages, 0, "Active pages scanned by the page daemon"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "inactive", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt, 0, "Inactive pages"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "inactpdpgs", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_pdpages, 0, "Inactive pages scanned by the page daemon"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "laundry", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 0, "laundry pages"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "laundpdpgs", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_pdpages, 0, "Laundry pages scanned by the page daemon"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "unswappable", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt, 0, "Unswappable pages"); SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "unswppdpgs", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_pdpages, 0, "Unswappable pages scanned by the page daemon"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "inactive_target", CTLFLAG_RD, &vmd->vmd_inactive_target, 0, "Target inactive pages"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_target", CTLFLAG_RD, &vmd->vmd_free_target, 0, "Target free pages"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_reserved", CTLFLAG_RD, &vmd->vmd_free_reserved, 0, "Reserved free pages"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_min", CTLFLAG_RD, &vmd->vmd_free_min, 0, "Minimum free pages"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "free_severe", CTLFLAG_RD, &vmd->vmd_free_severe, 0, "Severe free pages"); } static void vm_stats_init(void *arg __unused) { struct sysctl_oid *oid; int i; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm), OID_AUTO, "domain", CTLFLAG_RD, NULL, ""); for (i = 0; i < vm_ndomains; i++) vm_domain_stats_init(VM_DOMAIN(i), oid); } SYSINIT(vmstats_init, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_stats_init, NULL); Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 354868) +++ head/sys/vm/vm_object.c (revision 354869) @@ -1,2647 +1,2660 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static counter_u64_t object_collapses = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, "VM object collapses"); static counter_u64_t object_bypasses = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, "VM object bypasses"); static void counter_startup(void) { object_collapses = counter_u64_alloc(M_WAITOK); object_bypasses = counter_u64_alloc(M_WAITOK); } SYSINIT(object_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(REFCOUNT_COUNT(object->paging_in_progress) == 0, ("object %p paging_in_progress = %d", object, REFCOUNT_COUNT(object->paging_in_progress))); KASSERT(object->busy == 0, ("object %p busy = %d", object, object->busy)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; vm_radix_init(&object->rtree); refcount_init(&object->ref_count, 0); refcount_init(&object->paging_in_progress, 0); refcount_init(&object->busy, 0); object->resident_page_count = 0; object->shadow_count = 0; object->flags = OBJ_DEAD; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void -_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) +_vm_object_allocate(objtype_t type, vm_pindex_t size, u_short flags, + vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; if (type == OBJT_SWAP) pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff() iteration over object_list * sees up to date type and pctrie head if it observed * non-dead object. */ atomic_thread_fence_rel(); - switch (type) { - case OBJT_DEAD: - panic("_vm_object_allocate: can't create OBJT_DEAD"); - case OBJT_DEFAULT: - case OBJT_SWAP: - object->flags = OBJ_ONEMAPPING; - break; - case OBJT_DEVICE: - case OBJT_SG: - object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; - break; - case OBJT_MGTDEVICE: - object->flags = OBJ_FICTITIOUS; - break; - case OBJT_PHYS: - object->flags = OBJ_UNMANAGED; - break; - case OBJT_VNODE: - object->flags = 0; - break; - default: - panic("_vm_object_allocate: type %d is undefined", type); - } + object->pg_color = 0; + object->flags = flags; object->size = size; object->domain.dr_policy = NULL; object->generation = 1; object->cleangeneration = 1; refcount_init(&object->ref_count, 1); object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - - VM_MIN_KERNEL_ADDRESS), kernel_object); + VM_MIN_KERNEL_ADDRESS), OBJ_UNMANAGED, kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_zinit(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { refcount_acquiren(&object->paging_in_progress, i); } void vm_object_pip_wakeup(vm_object_t object) { refcount_release(&object->paging_in_progress); } void vm_object_pip_wakeupn(vm_object_t object, short i) { refcount_releasen(&object->paging_in_progress, i); } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (REFCOUNT_COUNT(object->paging_in_progress) > 0) { VM_OBJECT_WUNLOCK(object); refcount_wait(&object->paging_in_progress, waitid, PVM); VM_OBJECT_WLOCK(object); } } void vm_object_pip_wait_unlocked(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_UNLOCKED(object); while (REFCOUNT_COUNT(object->paging_in_progress) > 0) refcount_wait(&object->paging_in_progress, waitid, PVM); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; + u_short flags; + switch (type) { + case OBJT_DEAD: + panic("vm_object_allocate: can't create OBJT_DEAD"); + case OBJT_DEFAULT: + case OBJT_SWAP: + flags = OBJ_COLORED; + break; + case OBJT_DEVICE: + case OBJT_SG: + flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; + break; + case OBJT_MGTDEVICE: + flags = OBJ_FICTITIOUS; + break; + case OBJT_PHYS: + flags = OBJ_UNMANAGED; + break; + case OBJT_VNODE: + flags = 0; + break; + default: + panic("vm_object_allocate: type %d is undefined", type); + } object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); - _vm_object_allocate(type, size, object); + _vm_object_allocate(type, size, flags, object); + return (object); } +/* + * vm_object_allocate_anon: + * + * Returns a new default object of the given size and marked as + * anonymous memory for special split/collapse handling. Color + * to be initialized by the caller. + */ +vm_object_t +vm_object_allocate_anon(vm_pindex_t size) +{ + vm_object_t object; + object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); + _vm_object_allocate(OBJT_DEFAULT, size, OBJ_ANON | OBJ_ONEMAPPING, + object); + + return (object); +} + + /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_RLOCK(object); vm_object_reference_locked(object); VM_OBJECT_RUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_LOCKED(object); refcount_acquire(&object->ref_count); if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); if (refcount_release(&object->ref_count) && !umtx_shm_vnobj_persistent) umtx_shm_object_terminated(object); VM_OBJECT_RUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; bool released; while (object != NULL) { VM_OBJECT_RLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. A ref count * of 1 may be a special case depending on the shadow count * being 0 or 1. These cases require a write lock on the * object. */ - released = refcount_release_if_gt(&object->ref_count, 2); + if ((object->flags & OBJ_ANON) == 0) + released = refcount_release_if_gt(&object->ref_count, 1); + else + released = refcount_release_if_gt(&object->ref_count, 2); VM_OBJECT_RUNLOCK(object); if (released) return; VM_OBJECT_WLOCK(object); KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); refcount_release(&object->ref_count); if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->shadow_count == 0 && object->handle == NULL && - (object->type == OBJT_DEFAULT || - (object->type == OBJT_SWAP && - (object->flags & OBJ_TMPFS_NODE) == 0))) { + (object->flags & OBJ_ANON) != 0) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && - (object->type == OBJT_DEFAULT || - object->type == OBJT_SWAP)) { + (object->flags & OBJ_ANON) != 0) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ refcount_acquire(&object->ref_count); VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ - if ((robject->flags & OBJ_DEAD) == 0 && - (robject->handle == NULL) && - (robject->type == OBJT_DEFAULT || - robject->type == OBJT_SWAP)) { + if ((robject->flags & + (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON && + robject->handle == NULL) { refcount_acquire(&robject->ref_count); retry: if (REFCOUNT_COUNT(robject->paging_in_progress) > 0) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (REFCOUNT_COUNT(object->paging_in_progress) > 0) { VM_OBJECT_WUNLOCK(robject); VM_OBJECT_WUNLOCK(object); refcount_wait( &object->paging_in_progress, "objde2", PVM); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) { vm_object_set_flag(object, OBJ_DEAD); vm_object_terminate(object); } else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_terminate_pages removes any remaining pageable pages * from the object and resets the object to an empty state. */ static void vm_object_terminate_pages(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); KASSERT(p->object == object && (p->ref_count & VPRC_OBJREF) != 0, ("vm_object_terminate_pages: page %p is inconsistent", p)); p->object = NULL; if (vm_page_drop(p, VPRC_OBJREF) == VPRC_OBJREF) { VM_CNT_INC(v_pfree); vm_page_free(p); } } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("terminating non-dead obj %p", object)); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!REFCOUNT_COUNT(object->paging_in_progress), ("vm_object_terminate: pageout in progress")); KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); if ((object->flags & OBJ_PG_DTOR) == 0) vm_object_terminate_pages(object); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean) { vm_page_assert_busied(p); /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->aflags & PGA_NOSYNC) != 0) { *allclean = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with PGA_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t eio, res, allclean; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE || !vm_object_mightbedirty(object) || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); allclean = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (vm_page_none_valid(p)) continue; if (vm_page_busy_acquire(p, VM_ALLOC_WAITFAIL) == 0) { if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &allclean)) { vm_page_xunbusy(p); continue; } n = vm_object_page_collect_flush(object, p, pagerflags, flags, &allclean, &eio); if (eio) { res = FALSE; allclean = FALSE; } if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; allclean = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif if (allclean) object->cleangeneration = curgeneration; return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); vm_page_assert_xbusied(p); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && vm_object_mightbedirty(object) != 0 && ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * Determine whether the given advice can be applied to the object. Advice is * not applied to unmanaged pages since they never belong to page queues, and * since MADV_FREE is destructive, it can apply only to anonymous pages that * have been mapped at most once. */ static bool vm_object_advice_applies(vm_object_t object, int advice) { if ((object->flags & OBJ_UNMANAGED) != 0) return (false); if (advice != MADV_FREE) return (true); - return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) && - (object->flags & OBJ_ONEMAPPING) != 0); + return ((object->flags & (OBJ_ONEMAPPING | OBJ_ANON)) == + (OBJ_ONEMAPPING | OBJ_ANON)); } static void vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, vm_size_t size) { if (advice == MADV_FREE && object->type == OBJT_SWAP) swap_pager_freespace(object, pindex, size); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m, tm; if (object == NULL) return; relookup: VM_OBJECT_WLOCK(object); if (!vm_object_advice_applies(object, advice)) { VM_OBJECT_WUNLOCK(object); return; } for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; /* * If the next page isn't resident in the top-level object, we * need to search the shadow chain. When applying MADV_FREE, we * take care to release any swap space used to store * non-resident pages. */ if (m == NULL || pindex < m->pindex) { /* * Optimize a common case: if the top-level object has * no backing object, we can skip over the non-resident * range in constant time. */ if (object->backing_object == NULL) { tpindex = (m != NULL && m->pindex < end) ? m->pindex : end; vm_object_madvise_freespace(object, advice, pindex, tpindex - pindex); if ((pindex = tpindex) == end) break; goto next_page; } tpindex = pindex; do { vm_object_madvise_freespace(tobject, advice, tpindex, 1); /* * Prepare to search the next object in the * chain. */ backing_object = tobject->backing_object; if (backing_object == NULL) goto next_pindex; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; if (!vm_object_advice_applies(tobject, advice)) goto next_pindex; } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { next_page: tm = m; m = TAILQ_NEXT(m, listq); } /* * If the page is not in a normal state, skip it. The page * can not be invalidated while the object lock is held. */ if (!vm_page_all_valid(tm) || vm_page_wired(tm)) goto next_pindex; KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); KASSERT((tm->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_tryxbusy(tm) == 0) { if (object != tobject) VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(tm, PGA_REFERENCED); } vm_page_busy_sleep(tm, "madvpo", false); goto relookup; } vm_page_lock(tm); vm_page_advise(tm, advice); vm_page_unlock(tm); vm_page_xunbusy(tm); vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. + * + * If we hold the only reference we can guarantee that it won't + * increase while we have the map locked. Otherwise the race is + * harmless and we will end up with an extra shadow object that + * will be collapsed later. */ - if (source != NULL) { - VM_OBJECT_RLOCK(source); - if (source->ref_count == 1 && - source->handle == NULL && - (source->type == OBJT_DEFAULT || - source->type == OBJT_SWAP)) { - VM_OBJECT_RUNLOCK(source); - return; - } - VM_OBJECT_RUNLOCK(source); - } + if (source != NULL && source->ref_count == 1 && + source->handle == NULL && (source->flags & OBJ_ANON) != 0) + return; /* * Allocate a new object with the given length. */ - result = vm_object_allocate(OBJT_DEFAULT, atop(length)); + result = vm_object_allocate_anon(atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); result->domain = source->domain; LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; - if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) + if ((orig_object->flags & OBJ_ANON) == 0) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ - new_object = vm_object_allocate(OBJT_DEFAULT, size); + new_object = vm_object_allocate_anon(size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); new_object->domain = orig_object->domain; source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_tryxbusy(m) == 0) { VM_OBJECT_WUNLOCK(new_object); vm_page_sleep_if_busy(m, "spltwt"); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { vm_page_xunbusy(m); VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); vm_radix_wait(); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type != OBJT_SWAP) vm_page_xunbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); /* The page is only NULL when rename fails. */ if (p == NULL) { vm_radix_wait(); } else { if (p->object == object) VM_OBJECT_WUNLOCK(backing_object); else VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmocol", false); } VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; - if (backing_object->type != OBJT_DEFAULT && - backing_object->type != OBJT_SWAP) + if ((backing_object->flags & OBJ_ANON) == 0) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); /* * The valid check here is stable due to object lock being * required to clear valid and initiate paging. */ if ((pp == NULL || vm_page_none_valid(pp)) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_tryxbusy(p) == 0) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); else vm_page_xunbusy(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_tryxbusy(pp) == 0) { vm_page_xunbusy(p); /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || !vm_page_none_valid(pp), ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); else vm_page_xunbusy(p); if (pp != NULL) vm_page_xunbusy(pp); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { vm_page_xunbusy(p); if (pp != NULL) vm_page_xunbusy(pp); next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif vm_page_xunbusy(p); } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ + if ((backing_object->flags & OBJ_ANON) == 0) + break; VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || - (backing_object->type != OBJT_DEFAULT && - backing_object->type != OBJT_SWAP) || - (backing_object->flags & (OBJ_DEAD | OBJ_NOSPLIT)) != 0 || + (backing_object->flags & OBJ_DEAD) != 0 || object->handle != NULL || - (object->type != OBJT_DEFAULT && - object->type != OBJT_SWAP) || - (object->flags & OBJ_DEAD)) { + (object->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(backing_object); break; } if (REFCOUNT_COUNT(object->paging_in_progress) > 0 || REFCOUNT_COUNT(backing_object->paging_in_progress) > 0) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { vm_object_pip_add(object, 1); vm_object_pip_add(backing_object, 1); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); vm_object_pip_wakeup(object); counter_u64_add(object_collapses, 1); } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ refcount_release(&backing_object->ref_count); VM_OBJECT_WUNLOCK(backing_object); counter_u64_add(object_bypasses, 1); } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ if (vm_page_tryxbusy(p) == 0) { vm_page_sleep_if_busy(p, "vmopar"); goto again; } if (vm_page_wired(p)) { wired: if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { vm_page_invalid(p); vm_page_undirty(p); } vm_page_xunbusy(p); continue; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && !vm_page_none_valid(p)) { if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_write(p)) goto wired; if (p->dirty != 0) { vm_page_xunbusy(p); continue; } } if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_all(p)) goto wired; vm_page_free(p); } vm_object_pip_wakeup(object); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx; vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); vm_page_change_lock(p, &mtx); vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { rv = vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL); if (rv != VM_PAGER_OK) break; /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); - VM_OBJECT_WLOCK(prev_object); - if ((prev_object->type != OBJT_DEFAULT && - prev_object->type != OBJT_SWAP) || - (prev_object->flags & OBJ_NOSPLIT) != 0) { - VM_OBJECT_WUNLOCK(prev_object); + if ((prev_object->flags & OBJ_ANON) == 0) return (FALSE); - } + VM_OBJECT_WLOCK(prev_object); /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if (prev_object->ref_count > 1 && prev_object->size != next_pindex && (prev_object->flags & OBJ_ONEMAPPING) == 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); /* Only set for vnodes & tmpfs */ if (object->type != OBJT_VNODE && (object->flags & OBJ_TMPFS_NODE) == 0) return; atomic_add_int(&object->generation, 1); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject, t1object; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); again: locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } if (vm_page_trysbusy(tm) == 0) { for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; if (tm->object != tobject) VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } vm_page_busy_sleep(tm, "unwbo", true); goto again; } vm_page_unwire(tm, queue); vm_page_sunbusy(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } } /* * Return the vnode for the given object, or NULL if none exists. * For tmpfs objects, the function may return NULL if there is * no vnode allocated at the time of the call. */ struct vnode * vm_object_vnode(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) { vp = object->handle; KASSERT(vp != NULL, ("%s: OBJT_VNODE has no vnode", __func__)); } else if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; KASSERT(vp != NULL, ("%s: OBJT_TMPFS has no vnode", __func__)); } else { vp = NULL; } return (vp); } /* * Busy the vm object. This prevents new pages belonging to the object from * becoming busy. Existing pages persist as busy. Callers are responsible * for checking page state before proceeding. */ void vm_object_busy(vm_object_t obj) { VM_OBJECT_ASSERT_LOCKED(obj); refcount_acquire(&obj->busy); /* The fence is required to order loads of page busy. */ atomic_thread_fence_acq_rel(); } void vm_object_unbusy(vm_object_t obj) { refcount_release(&obj->busy); } void vm_object_busy_wait(vm_object_t obj, const char *wmesg) { VM_OBJECT_ASSERT_UNLOCKED(obj); if (obj->busy) refcount_sleep(&obj->busy, wmesg, PVM); } /* * Return the kvme type of the given object. * If vpp is not NULL, set it to the object's vm_object_vnode() or NULL. */ int vm_object_kvme_type(vm_object_t object, struct vnode **vpp) { VM_OBJECT_ASSERT_LOCKED(object); if (vpp != NULL) *vpp = vm_object_vnode(object); switch (object->type) { case OBJT_DEFAULT: return (KVME_TYPE_DEFAULT); case OBJT_VNODE: return (KVME_TYPE_VNODE); case OBJT_SWAP: if ((object->flags & OBJ_TMPFS_NODE) != 0) return (KVME_TYPE_VNODE); return (KVME_TYPE_SWAP); case OBJT_DEVICE: return (KVME_TYPE_DEVICE); case OBJT_PHYS: return (KVME_TYPE_PHYS); case OBJT_DEAD: return (KVME_TYPE_DEAD); case OBJT_SG: return (KVME_TYPE_SG); case OBJT_MGTDEVICE: return (KVME_TYPE_MGTDEVICE); default: return (KVME_TYPE_UNKNOWN); } } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject *kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK); error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = obj->shadow_count; kvo->kvo_memattr = obj->memattr; kvo->kvo_active = 0; kvo->kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->queue == PQ_ACTIVE) kvo->kvo_active++; else if (m->queue == PQ_INACTIVE) kvo->kvo_inactive++; } kvo->kvo_vn_fileid = 0; kvo->kvo_vn_fsid = 0; kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; kvo->kvo_type = vm_object_kvme_type(obj, &vp); if (vp != NULL) vref(vp); VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo->kvo_vn_fileid = va.va_fileid; kvo->kvo_vn_fsid = va.va_fsid; kvo->kvo_vn_fsid_freebsd11 = va.va_fsid; /* truncate */ } vput(vp); } strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo->kvo_path) + 1; kvo->kvo_structsize = roundup(kvo->kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); free(kvo, M_TEMP); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (map == 0) return 0; if (entry == 0) { VM_MAP_ENTRY_FOREACH(tmpe, map) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; VM_MAP_ENTRY_FOREACH(tmpe, tmpm) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_object.h =================================================================== --- head/sys/vm/vm_object.h (revision 354868) +++ head/sys/vm/vm_object.h (revision 354869) @@ -1,373 +1,374 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include #include #include #include #include /* * Types defined: * * vm_object_t Virtual memory object. * * List of locks * (a) atomic * (c) const until freed * (o) per-object lock * (f) free pages queue mutex * */ #ifndef VM_PAGE_HAVE_PGLIST TAILQ_HEAD(pglist, vm_page); #define VM_PAGE_HAVE_PGLIST #endif struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ struct pglist memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ struct domainset_ref domain; /* NUMA policy. */ volatile int generation; /* generation ID */ int cleangeneration; /* Generation at clean time */ volatile u_int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ vm_memattr_t memattr; /* default memory attribute for pages */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ volatile u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ volatile u_int busy; /* (a) object is busy, disallow page busy. */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; vm_ooffset_t writemappings; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; struct cdev_pager_ops *ops; struct cdev *dev; } devp; /* * SG pager * * sgp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) sgp_pglist; } sgp; /* * Swap pager * * swp_tmpfs - back-pointer to the tmpfs vnode, * if any, which uses the vm object * as backing store. The handle * cannot be reused for linking, * because the vnode can be * reclaimed and recreated, making * the handle changed and hash-chain * invalid. * * swp_blks - pc-trie of the allocated swap blocks. * */ struct { void *swp_tmpfs; struct pctrie swp_blks; vm_ooffset_t writemappings; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; void *umtx_data; }; /* * Flags */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ #define OBJ_POPULATE 0x0004 /* pager implements populate() */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ -#define OBJ_NOSPLIT 0x0010 /* dont split this object */ +#define OBJ_ANON 0x0010 /* (c) contains anonymous memory */ #define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ #define OBJ_SIZEVNLOCK 0x0040 /* lock vnode to check obj size */ #define OBJ_PG_DTOR 0x0080 /* dont reset object, leave that for dtor */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ /* * Helpers to perform conversion between vm_object page indexes and offsets. * IDX_TO_OFF() converts an index into an offset. * OFF_TO_IDX() converts an offset into an index. * OBJ_MAX_SIZE specifies the maximum page index corresponding to the * maximum unsigned offset. */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #define OBJ_MAX_SIZE (OFF_TO_IDX(UINT64_MAX) + 1) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if PGA_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). */ #define OBJPR_CLEANONLY 0x1 /* Don't remove dirty pages. */ #define OBJPR_NOTMAPPED 0x2 /* Don't unmap pages. */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; /* kernel and kmem are aliased for backwards KPI compat. */ #define kernel_object (&kernel_object_store) #define kmem_object (&kernel_object_store) #define VM_OBJECT_ASSERT_LOCKED(object) \ rw_assert(&(object)->lock, RA_LOCKED) #define VM_OBJECT_ASSERT_RLOCKED(object) \ rw_assert(&(object)->lock, RA_RLOCKED) #define VM_OBJECT_ASSERT_WLOCKED(object) \ rw_assert(&(object)->lock, RA_WLOCKED) #define VM_OBJECT_ASSERT_UNLOCKED(object) \ rw_assert(&(object)->lock, RA_UNLOCKED) #define VM_OBJECT_LOCK_DOWNGRADE(object) \ rw_downgrade(&(object)->lock) #define VM_OBJECT_RLOCK(object) \ rw_rlock(&(object)->lock) #define VM_OBJECT_RUNLOCK(object) \ rw_runlock(&(object)->lock) #define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo)) #define VM_OBJECT_TRYRLOCK(object) \ rw_try_rlock(&(object)->lock) #define VM_OBJECT_TRYWLOCK(object) \ rw_try_wlock(&(object)->lock) #define VM_OBJECT_TRYUPGRADE(object) \ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) #define VM_OBJECT_WOWNED(object) \ rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) #define VM_OBJECT_DROP(object) \ lock_class_rw.lc_unlock(&(object)->lock.lock_object) #define VM_OBJECT_PICKUP(object, state) \ lock_class_rw.lc_lock(&(object)->lock.lock_object, (state)) struct vnode; /* * The object must be locked or thread private. */ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } /* * Conditionally set the object's color, which (1) enables the allocation * of physical memory reservations for anonymous objects and larger-than- * superpage-sized named objects and (2) determines the first page offset * within the object at which a reservation may be allocated. In other * words, the color determines the alignment of the object with respect * to the largest superpage boundary. When mapping named objects, like * files or POSIX shared memory objects, the color should be set to zero * before a virtual address is selected for the mapping. In contrast, * for anonymous objects, the color may be set after the virtual address * is selected. * * The object must be locked. */ static __inline void vm_object_color(vm_object_t object, u_short color) { if ((object->flags & OBJ_COLORED) == 0) { object->pg_color = color; object->flags |= OBJ_COLORED; } } static __inline bool vm_object_reserv(vm_object_t object) { if (object != NULL && (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) == OBJ_COLORED) { return (true); } return (false); } static __inline bool vm_object_mightbedirty(vm_object_t object) { return (object->type == OBJT_VNODE && object->generation != object->cleangeneration); } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); void vm_object_pip_wait_unlocked(vm_object_t object, char *waitid); void vm_object_busy(vm_object_t object); void vm_object_unbusy(vm_object_t object); void vm_object_busy_wait(vm_object_t object, const char *wmesg); static inline bool vm_object_busied(vm_object_t object) { return (object->busy != 0); } #define VM_OBJECT_ASSERT_BUSY(object) MPASS(vm_object_busied((object))) void umtx_shm_object_init(vm_object_t object); void umtx_shm_object_terminated(vm_object_t object); extern int umtx_shm_vnobj_persistent; vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); +vm_object_t vm_object_allocate_anon(vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_destroy (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); int vm_object_kvme_type(vm_object_t object, struct vnode **vpp); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_object_print(long addr, boolean_t have_addr, long count, char *modif); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue); struct vnode *vm_object_vnode(vm_object_t object); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ Index: head/sys/vm/vm_reserv.c =================================================================== --- head/sys/vm/vm_reserv.c (revision 354868) +++ head/sys/vm/vm_reserv.c (revision 354869) @@ -1,1429 +1,1419 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2011 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Superpage reservation management module * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The reservation system supports the speculative allocation of large physical * pages ("superpages"). Speculative allocation enables the fully automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ #if VM_NRESERVLEVEL > 0 #ifndef VM_LEVEL_0_ORDER_MAX #define VM_LEVEL_0_ORDER_MAX VM_LEVEL_0_ORDER #endif /* * The number of small pages that are contained in a level 0 reservation */ #define VM_LEVEL_0_NPAGES (1 << VM_LEVEL_0_ORDER) #define VM_LEVEL_0_NPAGES_MAX (1 << VM_LEVEL_0_ORDER_MAX) /* * The number of bits by which a physical address is shifted to obtain the * reservation number */ #define VM_LEVEL_0_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) /* * The size of a level 0 reservation in bytes */ #define VM_LEVEL_0_SIZE (1 << VM_LEVEL_0_SHIFT) /* * Computes the index of the small page underlying the given (object, pindex) * within the reservation's array of small pages. */ #define VM_RESERV_INDEX(object, pindex) \ (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) /* * The size of a population map entry */ typedef u_long popmap_t; /* * The number of bits in a population map entry */ #define NBPOPMAP (NBBY * sizeof(popmap_t)) /* * The number of population map entries in a reservation */ #define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) #define NPOPMAP_MAX howmany(VM_LEVEL_0_NPAGES_MAX, NBPOPMAP) /* * Number of elapsed ticks before we update the LRU queue position. Used * to reduce contention and churn on the list. */ #define PARTPOPSLOP 1 /* * Clear a bit in the population map. */ static __inline void popmap_clear(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); } /* * Set a bit in the population map. */ static __inline void popmap_set(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); } /* * Is a bit in the population map clear? */ static __inline boolean_t popmap_is_clear(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); } /* * Is a bit in the population map set? */ static __inline boolean_t popmap_is_set(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); } /* * The reservation structure * * A reservation structure is constructed whenever a large physical page is * speculatively allocated to an object. The reservation provides the small * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the * reservation is not fully utilized, it appears in the queue of partially * populated reservations. The reservation always appears on the containing * object's list of reservations. * * A partially populated reservation can be broken and reclaimed at any time. * * r - vm_reserv_lock * d - vm_reserv_domain_lock * o - vm_reserv_object_lock * c - constant after boot */ struct vm_reserv { struct mtx lock; /* reservation lock. */ TAILQ_ENTRY(vm_reserv) partpopq; /* (d, r) per-domain queue. */ LIST_ENTRY(vm_reserv) objq; /* (o, r) object queue */ vm_object_t object; /* (o, r) containing object */ vm_pindex_t pindex; /* (o, r) offset in object */ vm_page_t pages; /* (c) first page */ uint16_t popcnt; /* (r) # of pages in use */ uint8_t domain; /* (c) NUMA domain. */ char inpartpopq; /* (d, r) */ int lasttick; /* (r) last pop update tick. */ popmap_t popmap[NPOPMAP_MAX]; /* (r) bit vector, used pages */ }; #define vm_reserv_lockptr(rv) (&(rv)->lock) #define vm_reserv_assert_locked(rv) \ mtx_assert(vm_reserv_lockptr(rv), MA_OWNED) #define vm_reserv_lock(rv) mtx_lock(vm_reserv_lockptr(rv)) #define vm_reserv_trylock(rv) mtx_trylock(vm_reserv_lockptr(rv)) #define vm_reserv_unlock(rv) mtx_unlock(vm_reserv_lockptr(rv)) /* * The reservation array * * This array is analoguous in function to vm_page_array. It differs in the * respect that it may contain a greater number of useful reservation * structures than there are (physical) superpages. These "invalid" * reservation structures exist to trade-off space for time in the * implementation of vm_reserv_from_page(). Invalid reservation structures are * distinguishable from "valid" reservation structures by inspecting the * reservation's "pages" field. Invalid reservation structures have a NULL * "pages" field. * * vm_reserv_from_page() maps a small (physical) page to an element of this * array by computing a physical reservation number from the page's physical * address. The physical reservation number is used as the array index. * * An "active" reservation is a valid reservation structure that has a non-NULL * "object" field and a non-zero "popcnt" field. In other words, every active * reservation belongs to a particular object. Moreover, every active * reservation has an entry in the containing object's list of reservations. */ static vm_reserv_t vm_reserv_array; /* * The per-domain partially populated reservation queues * * These queues enable the fast recovery of an unused free small page from a * partially populated reservation. The reservation at the head of a queue * is the least recently changed, partially populated reservation. * * Access to this queue is synchronized by the per-domain reservation lock. */ struct vm_reserv_domain { struct mtx lock; TAILQ_HEAD(, vm_reserv) partpop; } __aligned(CACHE_LINE_SIZE); static struct vm_reserv_domain vm_rvd[MAXMEMDOM]; #define vm_reserv_domain_lockptr(d) (&vm_rvd[(d)].lock) #define vm_reserv_domain_lock(d) mtx_lock(vm_reserv_domain_lockptr(d)) #define vm_reserv_domain_unlock(d) mtx_unlock(vm_reserv_domain_lockptr(d)) static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info"); static counter_u64_t vm_reserv_broken = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD, &vm_reserv_broken, "Cumulative number of broken reservations"); static counter_u64_t vm_reserv_freed = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD, &vm_reserv_freed, "Cumulative number of freed reservations"); static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations"); static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues"); static counter_u64_t vm_reserv_reclaimed = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD, &vm_reserv_reclaimed, "Cumulative number of reclaimed reservations"); /* * The object lock pool is used to synchronize the rvq. We can not use a * pool mutex because it is required before malloc works. * * The "hash" function could be made faster without divide and modulo. */ #define VM_RESERV_OBJ_LOCK_COUNT MAXCPU struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT]; #define vm_reserv_object_lock_idx(object) \ (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT) #define vm_reserv_object_lock_ptr(object) \ &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))] #define vm_reserv_object_lock(object) \ mtx_lock(vm_reserv_object_lock_ptr((object))) #define vm_reserv_object_unlock(object) \ mtx_unlock(vm_reserv_object_lock_ptr((object))) static void vm_reserv_break(vm_reserv_t rv); static void vm_reserv_depopulate(vm_reserv_t rv, int index); static vm_reserv_t vm_reserv_from_page(vm_page_t m); static boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex); static void vm_reserv_populate(vm_reserv_t rv, int index); static void vm_reserv_reclaim(vm_reserv_t rv); /* * Returns the current number of full reservations. * * Since the number of full reservations is computed without acquiring any * locks, the returned value is inexact. */ static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) { vm_paddr_t paddr; struct vm_phys_seg *seg; vm_reserv_t rv; int fullpop, segind; fullpop = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE > paddr && paddr + VM_LEVEL_0_SIZE <= seg->end) { rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; paddr += VM_LEVEL_0_SIZE; } } return (sysctl_handle_int(oidp, &fullpop, 0, req)); } /* * Describes the current state of the partially populated reservation queue. */ static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; vm_reserv_t rv; int counter, error, domain, level, unused_pages; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\nDOMAIN LEVEL SIZE NUMBER\n\n"); for (domain = 0; domain < vm_ndomains; domain++) { for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { counter = 0; unused_pages = 0; vm_reserv_domain_lock(domain); TAILQ_FOREACH(rv, &vm_rvd[domain].partpop, partpopq) { counter++; unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; } vm_reserv_domain_unlock(domain); sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n", domain, level, unused_pages * ((int)PAGE_SIZE / 1024), counter); } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Remove a reservation from the object's objq. */ static void vm_reserv_remove(vm_reserv_t rv) { vm_object_t object; vm_reserv_assert_locked(rv); CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); KASSERT(rv->object != NULL, ("vm_reserv_remove: reserv %p is free", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv)); object = rv->object; vm_reserv_object_lock(object); LIST_REMOVE(rv, objq); rv->object = NULL; vm_reserv_object_unlock(object); } /* * Insert a new reservation into the object's objq. */ static void vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex) { int i; vm_reserv_assert_locked(rv); CTR6(KTR_VM, "%s: rv %p(%p) object %p new %p popcnt %d", __FUNCTION__, rv, rv->pages, rv->object, object, rv->popcnt); KASSERT(rv->object == NULL, ("vm_reserv_insert: reserv %p isn't free", rv)); KASSERT(rv->popcnt == 0, ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_insert: reserv %p's popmap is corrupted", rv)); vm_reserv_object_lock(object); rv->pindex = pindex; rv->object = object; rv->lasttick = ticks; LIST_INSERT_HEAD(&object->rvq, rv, objq); vm_reserv_object_unlock(object); } /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the * reservation to the tail of the partially populated reservation queue if the * population count is non-zero. */ static void vm_reserv_depopulate(vm_reserv_t rv, int index) { struct vm_domain *vmd; vm_reserv_assert_locked(rv); CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); KASSERT(popmap_is_set(rv->popmap, index), ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv, index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); KASSERT(rv->domain < vm_ndomains, ("vm_reserv_depopulate: reserv %p's domain is corrupted %d", rv, rv->domain)); if (rv->popcnt == VM_LEVEL_0_NPAGES) { KASSERT(rv->pages->psind == 1, ("vm_reserv_depopulate: reserv %p is already demoted", rv)); rv->pages->psind = 0; } popmap_clear(rv->popmap, index); rv->popcnt--; if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP || rv->popcnt == 0) { vm_reserv_domain_lock(rv->domain); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq); rv->inpartpopq = FALSE; } if (rv->popcnt != 0) { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq); } vm_reserv_domain_unlock(rv->domain); rv->lasttick = ticks; } vmd = VM_DOMAIN(rv->domain); if (rv->popcnt == 0) { vm_reserv_remove(rv); vm_domain_free_lock(vmd); vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER); vm_domain_free_unlock(vmd); counter_u64_add(vm_reserv_freed, 1); } vm_domain_freecnt_inc(vmd, 1); } /* * Returns the reservation to which the given page might belong. */ static __inline vm_reserv_t vm_reserv_from_page(vm_page_t m) { return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]); } /* * Returns an existing reservation or NULL and initialized successor pointer. */ static vm_reserv_t vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred, vm_page_t *msuccp) { vm_reserv_t rv; vm_page_t msucc; msucc = NULL; if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_reserv_from_object: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_reserv_from_object: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, ("vm_reserv_from_object: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } rv = NULL; found: *msuccp = msucc; return (rv); } /* * Returns TRUE if the given reservation contains the given page index and * FALSE otherwise. */ static __inline boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex) { return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0); } /* * Increases the given reservation's population count. Moves the reservation * to the tail of the partially populated reservation queue. */ static void vm_reserv_populate(vm_reserv_t rv, int index) { vm_reserv_assert_locked(rv); CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); KASSERT(rv->object != NULL, ("vm_reserv_populate: reserv %p is free", rv)); KASSERT(popmap_is_clear(rv->popmap, index), ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv, index)); KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES, ("vm_reserv_populate: reserv %p is already full", rv)); KASSERT(rv->pages->psind == 0, ("vm_reserv_populate: reserv %p is already promoted", rv)); KASSERT(rv->domain < vm_ndomains, ("vm_reserv_populate: reserv %p's domain is corrupted %d", rv, rv->domain)); popmap_set(rv->popmap, index); rv->popcnt++; if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP && rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES) return; rv->lasttick = ticks; vm_reserv_domain_lock(rv->domain); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq); rv->inpartpopq = FALSE; } if (rv->popcnt < VM_LEVEL_0_NPAGES) { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq); } else { KASSERT(rv->pages->psind == 0, ("vm_reserv_populate: reserv %p is already promoted", rv)); rv->pages->psind = 1; } vm_reserv_domain_unlock(rv->domain); } /* * Allocates a contiguous set of physical pages of the given size "npages" * from existing or newly created reservations. All of the physical pages * must be at or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross any * physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object must be locked. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t pa, size; vm_page_t m, m_ret, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; u_long allocpages, maxpages, minpages; int i, index, n; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex + npages > object->size) return (NULL); /* * All reservations of a particular size have the same alignment. * Assuming that the first page is allocated from a reservation, the * least significant bits of its physical address can be determined * from its offset from the beginning of the reservation and the size * of the reservation. * * Could the specified index within a reservation of the smallest * possible size satisfy the alignment and boundary requirements? */ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT; if ((pa & (alignment - 1)) != 0) return (NULL); size = npages << PAGE_SHIFT; if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* * Look for an existing reservation. */ rv = vm_reserv_from_object(object, pindex, mpred, &msucc); if (rv != NULL) { KASSERT(object != kernel_object || rv->domain == domain, ("vm_reserv_alloc_contig: domain mismatch")); index = VM_RESERV_INDEX(object, pindex); /* Does the allocation fit within the reservation? */ if (index + npages > VM_LEVEL_0_NPAGES) return (NULL); domain = rv->domain; vmd = VM_DOMAIN(domain); vm_reserv_lock(rv); /* Handle reclaim race. */ if (rv->object != object) goto out; m = &rv->pages[index]; pa = VM_PAGE_TO_PHYS(m); if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) goto out; /* Handle vm_page_rename(m, new_object, ...). */ for (i = 0; i < npages; i++) if (popmap_is_set(rv->popmap, index + i)) goto out; if (!vm_domain_allocate(vmd, req, npages)) goto out; for (i = 0; i < npages; i++) vm_reserv_populate(rv, index + i); vm_reserv_unlock(rv); return (m); out: vm_reserv_unlock(rv); return (NULL); } /* * Could at least one reservation fit between the first index to the * left that can be used ("leftcap") and the first index to the right * that cannot be used ("rightcap")? * * We must synchronize with the reserv object lock to protect the * pindex/object of the resulting reservations against rename while * we are inspecting. */ first = pindex - VM_RESERV_INDEX(object, pindex); minpages = VM_RESERV_INDEX(object, pindex) + npages; maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES); allocpages = maxpages; vm_reserv_object_lock(object); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) { vm_reserv_object_unlock(object); return (NULL); } } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + maxpages > rightcap) { if (maxpages == VM_LEVEL_0_NPAGES) { vm_reserv_object_unlock(object); return (NULL); } /* * At least one reservation will fit between "leftcap" * and "rightcap". However, a reservation for the * last of the requested pages will not fit. Reduce * the size of the upcoming allocation accordingly. */ allocpages = minpages; } } vm_reserv_object_unlock(object); /* * Would the last new reservation extend past the end of the object? + * + * If the object is unlikely to grow don't allocate a reservation for + * the tail. */ - if (first + maxpages > object->size) { - /* - * Don't allocate the last new reservation if the object is a - * vnode or backed by another object that is a vnode. - */ - if (object->type == OBJT_VNODE || - (object->backing_object != NULL && - object->backing_object->type == OBJT_VNODE)) { - if (maxpages == VM_LEVEL_0_NPAGES) - return (NULL); - allocpages = minpages; - } - /* Speculate that the object may grow. */ + if ((object->flags & OBJ_ANON) == 0 && + first + maxpages > object->size) { + if (maxpages == VM_LEVEL_0_NPAGES) + return (NULL); + allocpages = minpages; } /* * Allocate the physical pages. The alignment and boundary specified * for this allocation may be different from the alignment and * boundary specified for the requested pages. For instance, the * specified index may not be the first page within the first new * reservation. */ m = NULL; vmd = VM_DOMAIN(domain); if (vm_domain_allocate(vmd, req, npages)) { vm_domain_free_lock(vmd); m = vm_phys_alloc_contig(domain, allocpages, low, high, ulmax(alignment, VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, npages); return (NULL); } } else return (NULL); KASSERT(vm_phys_domain(m) == domain, ("vm_reserv_alloc_contig: Page domain does not match requested.")); /* * The allocated physical pages always begin at a reservation * boundary, but they do not always end at a reservation boundary. * Initialize every reservation that is completely covered by the * allocated physical pages. */ m_ret = NULL; index = VM_RESERV_INDEX(object, pindex); do { rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_contig: reserv %p's pages is corrupted", rv)); vm_reserv_lock(rv); vm_reserv_insert(rv, object, first); n = ulmin(VM_LEVEL_0_NPAGES - index, npages); for (i = 0; i < n; i++) vm_reserv_populate(rv, index + i); npages -= n; if (m_ret == NULL) { m_ret = &rv->pages[index]; index = 0; } vm_reserv_unlock(rv); m += VM_LEVEL_0_NPAGES; first += VM_LEVEL_0_NPAGES; allocpages -= VM_LEVEL_0_NPAGES; } while (allocpages >= VM_LEVEL_0_NPAGES); return (m_ret); } /* * Allocate a physical page from an existing or newly created reservation. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object must be locked. */ vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { struct vm_domain *vmd; vm_page_t m, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; int index; VM_OBJECT_ASSERT_WLOCKED(object); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex >= object->size) return (NULL); /* * Look for an existing reservation. */ rv = vm_reserv_from_object(object, pindex, mpred, &msucc); if (rv != NULL) { KASSERT(object != kernel_object || rv->domain == domain, ("vm_reserv_alloc_page: domain mismatch")); domain = rv->domain; vmd = VM_DOMAIN(domain); index = VM_RESERV_INDEX(object, pindex); m = &rv->pages[index]; vm_reserv_lock(rv); /* Handle reclaim race. */ if (rv->object != object || /* Handle vm_page_rename(m, new_object, ...). */ popmap_is_set(rv->popmap, index)) { m = NULL; goto out; } if (vm_domain_allocate(vmd, req, 1) == 0) m = NULL; else vm_reserv_populate(rv, index); out: vm_reserv_unlock(rv); return (m); } /* * Could a reservation fit between the first index to the left that * can be used and the first index to the right that cannot be used? * * We must synchronize with the reserv object lock to protect the * pindex/object of the resulting reservations against rename while * we are inspecting. */ first = pindex - VM_RESERV_INDEX(object, pindex); vm_reserv_object_lock(object); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) { vm_reserv_object_unlock(object); return (NULL); } } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + VM_LEVEL_0_NPAGES > rightcap) { vm_reserv_object_unlock(object); return (NULL); } } vm_reserv_object_unlock(object); /* - * Would a new reservation extend past the end of the object? + * Would the last new reservation extend past the end of the object? + * + * If the object is unlikely to grow don't allocate a reservation for + * the tail. */ - if (first + VM_LEVEL_0_NPAGES > object->size) { - /* - * Don't allocate a new reservation if the object is a vnode or - * backed by another object that is a vnode. - */ - if (object->type == OBJT_VNODE || - (object->backing_object != NULL && - object->backing_object->type == OBJT_VNODE)) - return (NULL); - /* Speculate that the object may grow. */ - } + if ((object->flags & OBJ_ANON) == 0 && + first + VM_LEVEL_0_NPAGES > object->size) + return (NULL); /* * Allocate and populate the new reservation. */ m = NULL; vmd = VM_DOMAIN(domain); if (vm_domain_allocate(vmd, req, 1)) { vm_domain_free_lock(vmd); m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); return (NULL); } } else return (NULL); rv = vm_reserv_from_page(m); vm_reserv_lock(rv); KASSERT(rv->pages == m, ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv)); vm_reserv_insert(rv, object, first); index = VM_RESERV_INDEX(object, pindex); vm_reserv_populate(rv, index); vm_reserv_unlock(rv); return (&rv->pages[index]); } /* * Breaks the given reservation. All free pages in the reservation * are returned to the physical memory allocator. The reservation's * population count and map are reset to their initial state. * * The given reservation must not be in the partially populated reservation * queue. */ static void vm_reserv_break(vm_reserv_t rv) { u_long changes; int bitpos, hi, i, lo; vm_reserv_assert_locked(rv); CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); vm_reserv_remove(rv); rv->pages->psind = 0; hi = lo = -1; for (i = 0; i <= NPOPMAP; i++) { /* * "changes" is a bitmask that marks where a new sequence of * 0s or 1s begins in popmap[i], with last bit in popmap[i-1] * considered to be 1 if and only if lo == hi. The bits of * popmap[-1] and popmap[NPOPMAP] are considered all 1s. */ if (i == NPOPMAP) changes = lo != hi; else { changes = rv->popmap[i]; changes ^= (changes << 1) | (lo == hi); rv->popmap[i] = 0; } while (changes != 0) { /* * If the next change marked begins a run of 0s, set * lo to mark that position. Otherwise set hi and * free pages from lo up to hi. */ bitpos = ffsl(changes) - 1; changes ^= 1UL << bitpos; if (lo == hi) lo = NBPOPMAP * i + bitpos; else { hi = NBPOPMAP * i + bitpos; vm_domain_free_lock(VM_DOMAIN(rv->domain)); vm_phys_enqueue_contig(&rv->pages[lo], hi - lo); vm_domain_free_unlock(VM_DOMAIN(rv->domain)); lo = hi; } } } rv->popcnt = 0; counter_u64_add(vm_reserv_broken, 1); } /* * Breaks all reservations belonging to the given object. */ void vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; /* * This access of object->rvq is unsynchronized so that the * object rvq lock can nest after the domain_free lock. We * must check for races in the results. However, the object * lock prevents new additions, so we are guaranteed that when * it returns NULL the object is properly empty. */ while ((rv = LIST_FIRST(&object->rvq)) != NULL) { vm_reserv_lock(rv); /* Reclaim race. */ if (rv->object != object) { vm_reserv_unlock(rv); continue; } vm_reserv_domain_lock(rv->domain); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq); rv->inpartpopq = FALSE; } vm_reserv_domain_unlock(rv->domain); vm_reserv_break(rv); vm_reserv_unlock(rv); } } /* * Frees the given page if it belongs to a reservation. Returns TRUE if the * page is freed and FALSE otherwise. */ boolean_t vm_reserv_free_page(vm_page_t m) { vm_reserv_t rv; boolean_t ret; rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); vm_reserv_lock(rv); /* Re-validate after lock. */ if (rv->object != NULL) { vm_reserv_depopulate(rv, m - rv->pages); ret = TRUE; } else ret = FALSE; vm_reserv_unlock(rv); return (ret); } /* * Initializes the reservation management system. Specifically, initializes * the reservation array. * * Requires that vm_page_array and first_page are initialized! */ void vm_reserv_init(void) { vm_paddr_t paddr; struct vm_phys_seg *seg; struct vm_reserv *rv; int i, segind; /* * Initialize the reservation array. Specifically, initialize the * "pages" field for every element that has an underlying superpage. */ for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE > paddr && paddr + VM_LEVEL_0_SIZE <= seg->end) { rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; rv->pages = PHYS_TO_VM_PAGE(paddr); rv->domain = seg->domain; mtx_init(&rv->lock, "vm reserv", NULL, MTX_DEF); paddr += VM_LEVEL_0_SIZE; } } for (i = 0; i < MAXMEMDOM; i++) { mtx_init(&vm_rvd[i].lock, "VM reserv domain", NULL, MTX_DEF); TAILQ_INIT(&vm_rvd[i].partpop); } for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++) mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL, MTX_DEF); } /* * Returns true if the given page belongs to a reservation and that page is * free. Otherwise, returns false. */ bool vm_reserv_is_page_free(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); if (rv->object == NULL) return (false); return (popmap_is_clear(rv->popmap, m - rv->pages)); } /* * If the given page belongs to a reservation, returns the level of that * reservation. Otherwise, returns -1. */ int vm_reserv_level(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->object != NULL ? 0 : -1); } /* * Returns a reservation level if the given page belongs to a fully populated * reservation and -1 otherwise. */ int vm_reserv_level_iffullpop(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); } /* * Breaks the given partially populated reservation, releasing its free pages * to the physical memory allocator. */ static void vm_reserv_reclaim(vm_reserv_t rv) { vm_reserv_assert_locked(rv); CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d", __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq); vm_reserv_domain_lock(rv->domain); KASSERT(rv->inpartpopq, ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); KASSERT(rv->domain < vm_ndomains, ("vm_reserv_reclaim: reserv %p's domain is corrupted %d", rv, rv->domain)); TAILQ_REMOVE(&vm_rvd[rv->domain].partpop, rv, partpopq); rv->inpartpopq = FALSE; vm_reserv_domain_unlock(rv->domain); vm_reserv_break(rv); counter_u64_add(vm_reserv_reclaimed, 1); } /* * Breaks the reservation at the head of the partially populated reservation * queue, releasing its free pages to the physical memory allocator. Returns * TRUE if a reservation is broken and FALSE otherwise. */ boolean_t vm_reserv_reclaim_inactive(int domain) { vm_reserv_t rv; while ((rv = TAILQ_FIRST(&vm_rvd[domain].partpop)) != NULL) { vm_reserv_lock(rv); if (rv != TAILQ_FIRST(&vm_rvd[domain].partpop)) { vm_reserv_unlock(rv); continue; } vm_reserv_reclaim(rv); vm_reserv_unlock(rv); return (TRUE); } return (FALSE); } /* * Determine whether this reservation has free pages that satisfy the given * request for contiguous physical memory. Start searching from the lower * bound, defined by low_index. */ static bool vm_reserv_test_contig(vm_reserv_t rv, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa, size; u_long changes; int bitpos, bits_left, i, hi, lo, n; vm_reserv_assert_locked(rv); size = npages << PAGE_SHIFT; pa = VM_PAGE_TO_PHYS(&rv->pages[0]); lo = (pa < low) ? ((low + PAGE_MASK - pa) >> PAGE_SHIFT) : 0; i = lo / NBPOPMAP; changes = rv->popmap[i] | ((1UL << (lo % NBPOPMAP)) - 1); hi = (pa + VM_LEVEL_0_SIZE > high) ? ((high + PAGE_MASK - pa) >> PAGE_SHIFT) : VM_LEVEL_0_NPAGES; n = hi / NBPOPMAP; bits_left = hi % NBPOPMAP; hi = lo = -1; for (;;) { /* * "changes" is a bitmask that marks where a new sequence of * 0s or 1s begins in popmap[i], with last bit in popmap[i-1] * considered to be 1 if and only if lo == hi. The bits of * popmap[-1] and popmap[NPOPMAP] are considered all 1s. */ changes ^= (changes << 1) | (lo == hi); while (changes != 0) { /* * If the next change marked begins a run of 0s, set * lo to mark that position. Otherwise set hi and * look for a satisfactory first page from lo up to hi. */ bitpos = ffsl(changes) - 1; changes ^= 1UL << bitpos; if (lo == hi) { lo = NBPOPMAP * i + bitpos; continue; } hi = NBPOPMAP * i + bitpos; pa = VM_PAGE_TO_PHYS(&rv->pages[lo]); if ((pa & (alignment - 1)) != 0) { /* Skip to next aligned page. */ lo += (((pa - 1) | (alignment - 1)) + 1) >> PAGE_SHIFT; if (lo >= VM_LEVEL_0_NPAGES) return (false); pa = VM_PAGE_TO_PHYS(&rv->pages[lo]); } if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { /* Skip to next boundary-matching page. */ lo += (((pa - 1) | (boundary - 1)) + 1) >> PAGE_SHIFT; if (lo >= VM_LEVEL_0_NPAGES) return (false); pa = VM_PAGE_TO_PHYS(&rv->pages[lo]); } if (lo * PAGE_SIZE + size <= hi * PAGE_SIZE) return (true); lo = hi; } if (++i < n) changes = rv->popmap[i]; else if (i == n) changes = bits_left == 0 ? -1UL : (rv->popmap[n] | (-1UL << bits_left)); else return (false); } } /* * Searches the partially populated reservation queue for the least recently * changed reservation with free pages that satisfy the given request for * contiguous physical memory. If a satisfactory reservation is found, it is * broken. Returns true if a reservation is broken and false otherwise. */ boolean_t vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa, size; vm_reserv_t rv, rvn; if (npages > VM_LEVEL_0_NPAGES - 1) return (false); size = npages << PAGE_SHIFT; vm_reserv_domain_lock(domain); again: for (rv = TAILQ_FIRST(&vm_rvd[domain].partpop); rv != NULL; rv = rvn) { rvn = TAILQ_NEXT(rv, partpopq); pa = VM_PAGE_TO_PHYS(&rv->pages[0]); if (pa + VM_LEVEL_0_SIZE - size < low) { /* This entire reservation is too low; go to next. */ continue; } if (pa + size > high) { /* This entire reservation is too high; go to next. */ continue; } if (vm_reserv_trylock(rv) == 0) { vm_reserv_domain_unlock(domain); vm_reserv_lock(rv); if (!rv->inpartpopq) { vm_reserv_domain_lock(domain); if (!rvn->inpartpopq) goto again; continue; } } else vm_reserv_domain_unlock(domain); if (vm_reserv_test_contig(rv, npages, low, high, alignment, boundary)) { vm_reserv_reclaim(rv); vm_reserv_unlock(rv); return (true); } vm_reserv_unlock(rv); vm_reserv_domain_lock(domain); if (rvn != NULL && !rvn->inpartpopq) goto again; } vm_reserv_domain_unlock(domain); return (false); } /* * Transfers the reservation underlying the given page to a new object. * * The object must be locked. */ void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset) { vm_reserv_t rv; VM_OBJECT_ASSERT_WLOCKED(new_object); rv = vm_reserv_from_page(m); if (rv->object == old_object) { vm_reserv_lock(rv); CTR6(KTR_VM, "%s: rv %p object %p new %p popcnt %d inpartpop %d", __FUNCTION__, rv, rv->object, new_object, rv->popcnt, rv->inpartpopq); if (rv->object == old_object) { vm_reserv_object_lock(old_object); rv->object = NULL; LIST_REMOVE(rv, objq); vm_reserv_object_unlock(old_object); vm_reserv_object_lock(new_object); rv->object = new_object; rv->pindex -= old_object_offset; LIST_INSERT_HEAD(&new_object->rvq, rv, objq); vm_reserv_object_unlock(new_object); } vm_reserv_unlock(rv); } } /* * Returns the size (in bytes) of a reservation of the specified level. */ int vm_reserv_size(int level) { switch (level) { case 0: return (VM_LEVEL_0_SIZE); case -1: return (PAGE_SIZE); default: return (0); } } /* * Allocates the virtual and physical memory required by the reservation * management system's data structures, in particular, the reservation array. */ vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end) { vm_paddr_t new_end, high_water; size_t size; int i; high_water = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].end > high_water) high_water = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] > high_water) high_water = phys_avail[i + 1]; } /* * Calculate the size (in bytes) of the reservation array. Round up * from "high_water" because every small page is mapped to an element * in the reservation array based on its physical address. Thus, the * number of elements in the reservation array can be greater than the * number of superpages. */ size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv); /* * Allocate and map the physical memory for the reservation array. The * next available virtual address is returned by reference. */ new_end = end - round_page(size); vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero(vm_reserv_array, size); /* * Return the next available physical address. */ return (new_end); } /* * Initializes the reservation management system. Specifically, initializes * the reservation counters. */ static void vm_reserv_counter_init(void *unused) { vm_reserv_freed = counter_u64_alloc(M_WAITOK); vm_reserv_broken = counter_u64_alloc(M_WAITOK); vm_reserv_reclaimed = counter_u64_alloc(M_WAITOK); } SYSINIT(vm_reserv_counter_init, SI_SUB_CPU, SI_ORDER_ANY, vm_reserv_counter_init, NULL); /* * Returns the superpage containing the given page. */ vm_page_t vm_reserv_to_superpage(vm_page_t m) { vm_reserv_t rv; VM_OBJECT_ASSERT_LOCKED(m->object); rv = vm_reserv_from_page(m); if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES) m = rv->pages; else m = NULL; return (m); } #endif /* VM_NRESERVLEVEL > 0 */