diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 1aa05093f93a..8cb4c0006c59 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -1,2812 +1,2813 @@
/*-
 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	Virtual memory object module.
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean); static void vm_object_backing_remove(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM object stats"); static COUNTER_U64_DEFINE_EARLY(object_collapses); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, "VM object collapses"); static COUNTER_U64_DEFINE_EARLY(object_bypasses); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, "VM object bypasses"); static COUNTER_U64_DEFINE_EARLY(object_collapse_waits); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapse_waits, CTLFLAG_RD, &object_collapse_waits, "Number of sleeps for collapse"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(!vm_object_busied(object), ("object %p busy = %d", object, blockcount_read(&object->busy))); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } 
#endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; vm_radix_init(&object->rtree); refcount_init(&object->ref_count, 0); blockcount_init(&object->paging_in_progress); blockcount_init(&object->busy); object->resident_page_count = 0; object->shadow_count = 0; object->flags = OBJ_DEAD; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, u_short flags, vm_object_t object, void *handle) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; object->flags = flags; if ((flags & OBJ_SWAP) != 0) pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff() iteration over object_list * sees up to date type and pctrie head if it observed * non-dead object. */ atomic_thread_fence_rel(); object->pg_color = 0; object->size = size; object->domain.dr_policy = NULL; object->generation = 1; object->cleangeneration = 1; refcount_init(&object->ref_count, 1); object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = handle; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), OBJ_UNMANAGED, kernel_object, NULL); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif kernel_object->un_pager.phys.ops = &default_phys_pg_ops; /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. * * paging_in_progress is valid always. Lockless references to * the objects may acquire pip and then check OBJ_DEAD. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_zinit(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. 
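 *
 * The restriction is enforced below: the call fails with KERN_FAILURE
 * if the object already has resident pages, and with
 * KERN_INVALID_ARGUMENT if the object is already dead.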
*/ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_DEAD) return (KERN_INVALID_ARGUMENT); if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { if (i > 0) blockcount_acquire(&object->paging_in_progress, i); } void vm_object_pip_wakeup(vm_object_t object) { vm_object_pip_wakeupn(object, 1); } void vm_object_pip_wakeupn(vm_object_t object, short i) { if (i > 0) blockcount_release(&object->paging_in_progress, i); } /* * Atomically drop the object lock and wait for pip to drain. This protects * from sleep/wakeup races due to identity changes. The lock is not re-acquired * on return. */ static void vm_object_pip_sleep(vm_object_t object, const char *waitid) { (void)blockcount_sleep(&object->paging_in_progress, &object->lock, waitid, PVM | PDROP); } void vm_object_pip_wait(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); blockcount_wait(&object->paging_in_progress, &object->lock, waitid, PVM); } void vm_object_pip_wait_unlocked(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_UNLOCKED(object); blockcount_wait(&object->paging_in_progress, NULL, waitid, PVM); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; u_short flags; switch (type) { case OBJT_DEAD: panic("vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: flags = OBJ_COLORED; break; case OBJT_SWAP: case OBJT_SWAP_TMPFS: flags = OBJ_COLORED | OBJ_SWAP; break; case OBJT_DEVICE: case OBJT_SG: flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: flags = OBJ_UNMANAGED; break; case OBJT_VNODE: flags = 0; break; default: panic("vm_object_allocate: type %d is undefined", type); } object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, flags, object, NULL); return (object); } /* * vm_object_allocate_anon: * * Returns a new default object of the given size and marked as * anonymous memory for special split/collapse handling. Color * to be initialized by the caller. */ vm_object_t vm_object_allocate_anon(vm_pindex_t size, vm_object_t backing_object, struct ucred *cred, vm_size_t charge) { vm_object_t handle, object; if (backing_object == NULL) handle = NULL; else if ((backing_object->flags & OBJ_ANON) != 0) handle = backing_object->handle; else handle = backing_object; object = uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(OBJT_DEFAULT, size, OBJ_ANON | OBJ_ONEMAPPING, object, handle); object->cred = cred; object->charge = cred != NULL ? charge : 0; return (object); } static void vm_object_reference_vnode(vm_object_t object) { u_int old; /* * vnode objects need the lock for the first reference * to serialize with vnode_object_deallocate(). */ if (!refcount_acquire_if_gt(&object->ref_count, 0)) { VM_OBJECT_RLOCK(object); old = refcount_acquire(&object->ref_count); if (object->type == OBJT_VNODE && old == 0) vref(object->handle); VM_OBJECT_RUNLOCK(object); } } /* * vm_object_reference: * * Acquires a reference to the given object. 
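 *
 *	The object may be NULL, in which case the call is a no-op.  For
 *	vnode-backed objects the first reference also vref()s the vnode;
 *	see vm_object_reference_vnode() above.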
*/ void vm_object_reference(vm_object_t object) { if (object == NULL) return; if (object->type == OBJT_VNODE) vm_object_reference_vnode(object); else refcount_acquire(&object->ref_count); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference: Referenced dead object.")); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { u_int old; VM_OBJECT_ASSERT_LOCKED(object); old = refcount_acquire(&object->ref_count); if (object->type == OBJT_VNODE && old == 0) vref(object->handle); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference: Referenced dead object.")); } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_deallocate_vnode(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; bool last; KASSERT(object->type == OBJT_VNODE, ("vm_object_deallocate_vnode: not a vnode object")); KASSERT(vp != NULL, ("vm_object_deallocate_vnode: missing vp")); /* Object lock to protect handle lookup. */ last = refcount_release(&object->ref_count); VM_OBJECT_RUNLOCK(object); if (!last) return; if (!umtx_shm_vnobj_persistent) umtx_shm_object_terminated(object); /* vrele may need the vnode lock. */ vrele(vp); } /* * We dropped a reference on an object and discovered that it had a * single remaining shadow. This is a sibling of the reference we * dropped. Attempt to collapse the sibling and backing object. */ static vm_object_t vm_object_deallocate_anon(vm_object_t backing_object) { vm_object_t object; /* Fetch the final shadow. */ object = LIST_FIRST(&backing_object->shadow_head); KASSERT(object != NULL && backing_object->shadow_count == 1, ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d", backing_object->ref_count, backing_object->shadow_count)); KASSERT((object->flags & OBJ_ANON) != 0, ("invalid shadow object %p", object)); if (!VM_OBJECT_TRYWLOCK(object)) { /* * Prevent object from disappearing since we do not have a * reference. */ vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(backing_object); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); } else VM_OBJECT_WUNLOCK(backing_object); /* * Check for a collapse/terminate race with the last reference holder. */ if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 || !refcount_acquire_if_not_zero(&object->ref_count)) { VM_OBJECT_WUNLOCK(object); return (NULL); } backing_object = object->backing_object; if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0) vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); return (object); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; bool released; while (object != NULL) { /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. A ref count * of 1 may be a special case depending on the shadow count * being 0 or 1. These cases require a write lock on the * object. 
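 *
 * The common case is handled locklessly below:
 * refcount_release_if_gt() drops the reference without the object
 * lock whenever the count is high enough that this cannot be the
 * final (or, for OBJ_ANON objects, the next-to-final) reference.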
*/ if ((object->flags & OBJ_ANON) == 0) released = refcount_release_if_gt(&object->ref_count, 1); else released = refcount_release_if_gt(&object->ref_count, 2); if (released) return; if (object->type == OBJT_VNODE) { VM_OBJECT_RLOCK(object); if (object->type == OBJT_VNODE) { vm_object_deallocate_vnode(object); return; } VM_OBJECT_RUNLOCK(object); } VM_OBJECT_WLOCK(object); KASSERT(object->ref_count > 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If this is not the final reference to an anonymous * object we may need to collapse the shadow chain. */ if (!refcount_release(&object->ref_count)) { if (object->ref_count > 1 || object->shadow_count == 0) { if ((object->flags & OBJ_ANON) != 0 && object->ref_count == 1) vm_object_set_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); return; } /* Handle collapsing last ref on anonymous objects. */ object = vm_object_deallocate_anon(object); continue; } /* * Handle the final reference to an object. We restart * the loop with the backing object to avoid recursion. */ umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { - KASSERT(object->type != OBJT_SWAP_TMPFS, + KASSERT(object->type == OBJT_DEFAULT || + object->type == OBJT_SWAP, ("shadowed tmpfs v_object 2 %p", object)); vm_object_backing_remove(object); } KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_deallocate: Terminating dead object.")); vm_object_set_flag(object, OBJ_DEAD); vm_object_terminate(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. 
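 *
 * The memory goes back to the UMA zone.  The zone is created with
 * UMA_ZONE_NOFREE and objects are never removed from vm_object_list,
 * which keeps struct vm_object type stable for lockless readers such
 * as sysctl_vm_object_list().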
*/ uma_zfree(obj_zone, object); } static void vm_object_backing_remove_locked(vm_object_t object) { vm_object_t backing_object; backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT((object->flags & OBJ_COLLAPSING) == 0, ("vm_object_backing_remove: Removing collapsing object.")); if ((object->flags & OBJ_SHADOWLIST) != 0) { LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; object->flags &= ~OBJ_SHADOWLIST; } object->backing_object = NULL; } static void vm_object_backing_remove(vm_object_t object) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); if ((object->flags & OBJ_SHADOWLIST) != 0) { backing_object = object->backing_object; VM_OBJECT_WLOCK(backing_object); vm_object_backing_remove_locked(object); VM_OBJECT_WUNLOCK(backing_object); } else object->backing_object = NULL; } static void vm_object_backing_insert_locked(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_ASSERT_WLOCKED(backing_object); LIST_INSERT_HEAD(&backing_object->shadow_head, object, shadow_list); backing_object->shadow_count++; object->flags |= OBJ_SHADOWLIST; } object->backing_object = backing_object; } static void vm_object_backing_insert(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(backing_object); vm_object_backing_insert_locked(object, backing_object); VM_OBJECT_WUNLOCK(backing_object); } else object->backing_object = backing_object; } /* * Insert an object into a backing_object's shadow list with an additional * reference to the backing_object added. */ static void vm_object_backing_insert_ref(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(backing_object); KASSERT((backing_object->flags & OBJ_DEAD) == 0, ("shadowing dead anonymous object")); vm_object_reference_locked(backing_object); vm_object_backing_insert_locked(object, backing_object); vm_object_clear_flag(backing_object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(backing_object); } else { vm_object_reference(backing_object); object->backing_object = backing_object; } } /* * Transfer a backing reference from backing_object to object. */ static void vm_object_backing_transfer(vm_object_t object, vm_object_t backing_object) { vm_object_t new_backing_object; /* * Note that the reference to backing_object->backing_object * moves from within backing_object to within object. */ vm_object_backing_remove_locked(object); new_backing_object = backing_object->backing_object; if (new_backing_object == NULL) return; if ((new_backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(new_backing_object); vm_object_backing_remove_locked(backing_object); vm_object_backing_insert_locked(object, new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); } else { object->backing_object = new_backing_object; backing_object->backing_object = NULL; } } /* * Wait for a concurrent collapse to settle. */ static void vm_object_collapse_wait(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); while ((object->flags & OBJ_COLLAPSING) != 0) { vm_object_pip_wait(object, "vmcolwait"); counter_u64_add(object_collapse_waits, 1); } } /* * Waits for a backing object to clear a pending collapse and returns * it locked if it is an ANON object. 
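 *
 * Returns NULL when there is no anonymous backing object.  The caller's
 * object lock may be dropped and re-acquired while sleeping, so the
 * caller must re-validate any state derived from it.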
*/ static vm_object_t vm_object_backing_collapse_wait(vm_object_t object) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); for (;;) { backing_object = object->backing_object; if (backing_object == NULL || (backing_object->flags & OBJ_ANON) == 0) return (NULL); VM_OBJECT_WLOCK(backing_object); if ((backing_object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) == 0) break; VM_OBJECT_WUNLOCK(object); vm_object_pip_sleep(backing_object, "vmbckwait"); counter_u64_add(object_collapse_waits, 1); VM_OBJECT_WLOCK(object); } return (backing_object); } /* * vm_object_terminate_pages removes any remaining pageable pages * from the object and resets the object to an empty state. */ static void vm_object_terminate_pages(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); KASSERT(p->object == object && (p->ref_count & VPRC_OBJREF) != 0, ("vm_object_terminate_pages: page %p is inconsistent", p)); p->object = NULL; if (vm_page_drop(p, VPRC_OBJREF) == VPRC_OBJREF) { VM_CNT_INC(v_pfree); vm_page_free(p); } } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("terminating non-dead obj %p", object)); KASSERT((object->flags & OBJ_COLLAPSING) == 0, ("terminating collapsing obj %p", object)); KASSERT(object->backing_object == NULL, ("terminating shadow obj %p", object)); /* * Wait for the pageout daemon and other current users to be * done with the object. Note that new paging_in_progress * users can come after this wait, but they must check * OBJ_DEAD flag set (without unlocking the object), and avoid * the object being terminated. */ vm_object_pip_wait(object, "objtrm"); KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); if ((object->flags & OBJ_PG_DTOR) == 0) vm_object_terminate_pages(object); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || - object->type == OBJT_SWAP || object->type == OBJT_SWAP_TMPFS, + (object->flags & OBJ_SWAP) != 0, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. 
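 *
 * The page must be busied by the caller.  *allclean is cleared when a
 * nosync page is skipped so that the object is not later marked clean.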
*/ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean) { vm_page_assert_busied(p); /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->a.flags & PGA_NOSYNC) != 0) { *allclean = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with PGA_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * For swap objects backing tmpfs regular files, do not flush anything, * but remove write protection on the mapped pages to update mtime through * mmaped writes. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t eio, res, allclean; VM_OBJECT_ASSERT_WLOCKED(object); if (!vm_object_mightbedirty(object) || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); allclean = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (vm_page_none_valid(p)) continue; if (vm_page_busy_acquire(p, VM_ALLOC_WAITFAIL) == 0) { if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &allclean)) { vm_page_xunbusy(p); continue; } if (object->type == OBJT_VNODE) { n = vm_object_page_collect_flush(object, p, pagerflags, flags, &allclean, &eio); if (eio) { res = FALSE; allclean = FALSE; } if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; allclean = FALSE; } } else { n = 1; vm_page_xunbusy(p); } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif /* * Leave updating cleangeneration for tmpfs objects to tmpfs * scan. It needs to update mtime, which happens for other * filesystems during page writeouts. 
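 *
 * "allclean" starts out true only when the entire object is being
 * cleaned, and it is cleared whenever a nosync page is skipped or the
 * pager reports an error or a truncated write, so advancing
 * cleangeneration here cannot hide dirty pages.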
*/ if (allclean && object->type == OBJT_VNODE) object->cleangeneration = curgeneration; return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); vm_page_assert_xbusied(p); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && vm_object_mightbedirty(object) != 0 && ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? 
(OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * Determine whether the given advice can be applied to the object. Advice is * not applied to unmanaged pages since they never belong to page queues, and * since MADV_FREE is destructive, it can apply only to anonymous pages that * have been mapped at most once. */ static bool vm_object_advice_applies(vm_object_t object, int advice) { if ((object->flags & OBJ_UNMANAGED) != 0) return (false); if (advice != MADV_FREE) return (true); return ((object->flags & (OBJ_ONEMAPPING | OBJ_ANON)) == (OBJ_ONEMAPPING | OBJ_ANON)); } static void vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, vm_size_t size) { if (advice == MADV_FREE) vm_pager_freespace(object, pindex, size); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m, tm; if (object == NULL) return; relookup: VM_OBJECT_WLOCK(object); if (!vm_object_advice_applies(object, advice)) { VM_OBJECT_WUNLOCK(object); return; } for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; /* * If the next page isn't resident in the top-level object, we * need to search the shadow chain. When applying MADV_FREE, we * take care to release any swap space used to store * non-resident pages. */ if (m == NULL || pindex < m->pindex) { /* * Optimize a common case: if the top-level object has * no backing object, we can skip over the non-resident * range in constant time. */ if (object->backing_object == NULL) { tpindex = (m != NULL && m->pindex < end) ? m->pindex : end; vm_object_madvise_freespace(object, advice, pindex, tpindex - pindex); if ((pindex = tpindex) == end) break; goto next_page; } tpindex = pindex; do { vm_object_madvise_freespace(tobject, advice, tpindex, 1); /* * Prepare to search the next object in the * chain. 
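 *
 * The pindex is translated by backing_object_offset at
 * each level, and the intermediate object's lock is
 * dropped once the next object in the chain is locked.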
*/ backing_object = tobject->backing_object; if (backing_object == NULL) goto next_pindex; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; if (!vm_object_advice_applies(tobject, advice)) goto next_pindex; } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { next_page: tm = m; m = TAILQ_NEXT(m, listq); } /* * If the page is not in a normal state, skip it. The page * can not be invalidated while the object lock is held. */ if (!vm_page_all_valid(tm) || vm_page_wired(tm)) goto next_pindex; KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); KASSERT((tm->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_tryxbusy(tm) == 0) { if (object != tobject) VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(tm, PGA_REFERENCED); } vm_page_busy_sleep(tm, "madvpo", false); goto relookup; } vm_page_advise(tm, advice); vm_page_xunbusy(tm); vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow(vm_object_t *object, vm_ooffset_t *offset, vm_size_t length, struct ucred *cred, bool shared) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. * * If we hold the only reference we can guarantee that it won't * increase while we have the map locked. Otherwise the race is * harmless and we will end up with an extra shadow object that * will be collapsed later. */ if (source != NULL && source->ref_count == 1 && (source->flags & OBJ_ANON) != 0) return; /* * Allocate a new object with the given length. */ result = vm_object_allocate_anon(atop(length), source, cred, length); /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (shared || source != NULL) { VM_OBJECT_WLOCK(result); /* * The new object shadows the source object, adding a * reference to it. Our caller changes his reference * to point to the new object, removing a reference to * the source object. Net result: no change of * reference count, unless the caller needs to add one * more reference due to forking a shared map entry. */ if (shared) { vm_object_reference_locked(result); vm_object_clear_flag(result, OBJ_ONEMAPPING); } /* * Try to optimize the result object's page color when * shadowing in order to maintain page coloring * consistency in the combined shadowed object. */ if (source != NULL) { vm_object_backing_insert(result, source); result->domain = source->domain; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif } VM_OBJECT_WUNLOCK(result); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. 
This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_busy, m_next; vm_object_t orig_object, new_object, backing_object; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0, ("vm_object_split: Splitting object with multiple mappings.")); if ((orig_object->flags & OBJ_ANON) == 0) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate_anon(size, orig_object, orig_object->cred, ptoa(size)); /* * We must wait for the orig_object to complete any in-progress * collapse so that the swap blocks are stable below. The * additional reference on backing_object by new object will * prevent further collapse operations until split completes. */ VM_OBJECT_WLOCK(orig_object); vm_object_collapse_wait(orig_object); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); new_object->domain = orig_object->domain; backing_object = orig_object->backing_object; if (backing_object != NULL) { vm_object_backing_insert_ref(new_object, backing_object); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; } if (orig_object->cred != NULL) { crhold(orig_object->cred); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } /* * Mark the split operation so that swap_pager_getpages() knows * that the object is in transition. */ vm_object_set_flag(orig_object, OBJ_SPLIT); m_busy = NULL; #ifdef INVARIANTS idx = 0; #endif retry: m = vm_page_find_least(orig_object, offidxstart); KASSERT(m == NULL || idx <= m->pindex - offidxstart, ("%s: object %p was repopulated", __func__, orig_object)); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_tryxbusy(m) == 0) { VM_OBJECT_WUNLOCK(new_object); vm_page_sleep_if_busy(m, "spltwt"); VM_OBJECT_WLOCK(new_object); goto retry; } /* * The page was left invalid. Likely placed there by * an incomplete fault. Just remove and ignore. */ if (vm_page_none_valid(m)) { if (vm_page_remove(m)) vm_page_free(m); continue; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { vm_page_xunbusy(m); VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); vm_radix_wait(); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. 
Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif /* * orig_object's type may change while sleeping, so keep track * of the beginning of the busied range. */ if (orig_object->type != OBJT_SWAP) vm_page_xunbusy(m); else if (m_busy == NULL) m_busy = m; } if ((orig_object->flags & OBJ_SWAP) != 0) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); if (m_busy != NULL) TAILQ_FOREACH_FROM(m_busy, &new_object->memq, listq) vm_page_xunbusy(m_busy); } vm_object_clear_flag(orig_object, OBJ_SPLIT); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); /* The page is only NULL when rename fails. */ if (p == NULL) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); vm_radix_wait(); } else { if (p->object == object) VM_OBJECT_WUNLOCK(backing_object); else VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmocol", false); } VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if ((backing_object->flags & OBJ_ANON) == 0) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; if (p != NULL) { /* * If the backing object page is busy a * grandparent or older page may still be * undergoing CoW. It is not safe to collapse * the backing object until it is quiesced. */ if (vm_page_tryxbusy(p) == 0) return (false); /* * We raced with the fault handler that left * newly allocated invalid page on the object * queue and retried. */ if (!vm_page_all_valid(p)) goto unbusy_ret; } /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); /* * The valid check here is stable due to object lock * being required to clear valid and initiate paging. * Busy of p disallows fault handler to validate pp. 
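 *
 * If the parent has neither a valid resident page nor a
 * pager-backed copy at this index, it does not fully
 * shadow the backing object and the collapse is abandoned.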
*/ if ((pp == NULL || vm_page_none_valid(pp)) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) goto unbusy_ret; if (p != NULL) vm_page_xunbusy(p); } return (true); unbusy_ret: if (p != NULL) vm_page_xunbusy(p); return (false); } static void vm_object_collapse_scan(vm_object_t object) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_tryxbusy(p) == 0) { next = vm_object_collapse_scan_wait(object, p); continue; } KASSERT(object->backing_object == backing_object, ("vm_object_collapse_scan: backing object mismatch %p != %p", object->backing_object, backing_object)); KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch %p != %p", p->object, backing_object)); if (p->pindex < backing_offset_index || new_pindex >= object->size) { vm_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } if (!vm_page_all_valid(p)) { KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_tryxbusy(pp) == 0) { vm_page_xunbusy(p); /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. */ next = vm_object_collapse_scan_wait(object, pp); continue; } if (pp != NULL && vm_page_none_valid(pp)) { /* * The page was invalid in the parent. Likely placed * there by an incomplete fault. Just remove and * ignore. p can replace it. */ if (vm_page_remove(pp)) vm_page_free(pp); pp = NULL; } if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ vm_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); if (pp != NULL) vm_page_xunbusy(pp); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { vm_page_xunbusy(p); next = vm_object_collapse_scan_wait(object, NULL); continue; } /* Use the old pindex to free the right page. */ vm_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif vm_page_xunbusy(p); } return; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. 
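 *
 *	The object must be write-locked, anonymous, and not dead.  The
 *	loop below repeats until the shadow chain can no longer be
 *	shortened by either a collapse or a bypass.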
*/ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON, ("collapsing invalid object")); /* * Wait for the backing_object to finish any pending * collapse so that the caller sees the shortest possible * shadow chain. */ backing_object = vm_object_backing_collapse_wait(object); if (backing_object == NULL) return; KASSERT(object->ref_count > 0 && object->ref_count > object->shadow_count, ("collapse with invalid ref %d or shadow %d count.", object->ref_count, object->shadow_count)); KASSERT((backing_object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0, ("vm_object_collapse: Backing object already collapsing.")); KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0, ("vm_object_collapse: object is already collapsing.")); /* * We know that we can either collapse the backing object if * the parent is the only reference to it, or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. */ if (backing_object->ref_count == 1) { KASSERT(backing_object->shadow_count == 1, ("vm_object_collapse: shadow_count: %d", backing_object->shadow_count)); vm_object_pip_add(object, 1); vm_object_set_flag(object, OBJ_COLLAPSING); vm_object_pip_add(backing_object, 1); vm_object_set_flag(backing_object, OBJ_DEAD); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if ((backing_object->flags & OBJ_SWAP) != 0) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy backing_object, it will change the * type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. */ vm_object_clear_flag(object, OBJ_COLLAPSING); vm_object_backing_transfer(object, backing_object); object->backing_object_offset += backing_object->backing_object_offset; VM_OBJECT_WUNLOCK(object); vm_object_pip_wakeup(object); /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); (void)refcount_release(&backing_object->ref_count); vm_object_terminate(backing_object); counter_u64_add(object_collapses, 1); VM_OBJECT_WLOCK(object); } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. * * The object lock and backing_object lock must not * be dropped during this sequence. */ if (!vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. 
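 *
 * This is the "bypass" case: the parent is made to shadow
 * backing_object's own backing object directly.  It is counted
 * by the object_bypasses counter (vm.stats.object.bypasses).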
*/ vm_object_backing_remove_locked(object); new_backing_object = backing_object->backing_object; if (new_backing_object != NULL) { vm_object_backing_insert_ref(object, new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ (void)refcount_release(&backing_object->ref_count); KASSERT(backing_object->ref_count >= 1, ( "backing_object %p was somehow dereferenced during collapse!", backing_object)); VM_OBJECT_WUNLOCK(backing_object); counter_u64_add(object_bypasses, 1); } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. 
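 *
 * Such wired pages are left in the object: their mappings are
 * removed (unless OBJPR_NOTMAPPED is given) and they are
 * invalidated (unless OBJPR_CLEANONLY is given), but they are
 * never freed here.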
*/ if (vm_page_tryxbusy(p) == 0) { vm_page_sleep_if_busy(p, "vmopar"); goto again; } if (vm_page_wired(p)) { wired: if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { vm_page_invalid(p); vm_page_undirty(p); } vm_page_xunbusy(p); continue; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && !vm_page_none_valid(p)) { if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_write(p)) goto wired; if (p->dirty != 0) { vm_page_xunbusy(p); continue; } } if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_all(p)) goto wired; vm_page_free(p); } vm_object_pip_wakeup(object); vm_pager_freespace(object, start, (end == 0 ? object->size : end) - start); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); vm_page_deactivate_noreuse(p); } } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { rv = vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL); if (rv != VM_PAGER_OK) break; /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. 
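 *
 *	A NULL prev_object trivially succeeds, and objects that are
 *	not anonymous (OBJ_ANON) are never coalesced.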
*/ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); if ((prev_object->flags & OBJ_ANON) == 0) return (FALSE); VM_OBJECT_WLOCK(prev_object); /* * Try to collapse the object first. */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if (prev_object->ref_count > 1 && prev_object->size != next_pindex && (prev_object->flags & OBJ_ONEMAPPING) == 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty_(vm_object_t object) { atomic_add_int(&object->generation, 1); } bool vm_object_mightbedirty_(vm_object_t object) { return (object->generation != object->cleangeneration); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject, t1object; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); again: locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. 
*/ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } if (vm_page_trysbusy(tm) == 0) { for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; if (tm->object != tobject) VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } vm_page_busy_sleep(tm, "unwbo", true); goto again; } vm_page_unwire(tm, queue); vm_page_sunbusy(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } } /* * Return the vnode for the given object, or NULL if none exists. * For tmpfs objects, the function may return NULL if there is * no vnode allocated at the time of the call. */ struct vnode * vm_object_vnode(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_LOCKED(object); vm_pager_getvp(object, &vp, NULL); return (vp); } /* * Busy the vm object. This prevents new pages belonging to the object from * becoming busy. Existing pages persist as busy. Callers are responsible * for checking page state before proceeding. */ void vm_object_busy(vm_object_t obj) { VM_OBJECT_ASSERT_LOCKED(obj); blockcount_acquire(&obj->busy, 1); /* The fence is required to order loads of page busy. */ atomic_thread_fence_acq_rel(); } void vm_object_unbusy(vm_object_t obj) { blockcount_release(&obj->busy, 1); } void vm_object_busy_wait(vm_object_t obj, const char *wmesg) { VM_OBJECT_ASSERT_UNLOCKED(obj); (void)blockcount_sleep(&obj->busy, NULL, wmesg, PVM); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject *kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; u_long sp; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK); error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = obj->shadow_count; kvo->kvo_memattr = obj->memattr; kvo->kvo_active = 0; kvo->kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. 
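/*
 * A minimal sketch of the object-busy protocol implemented above; the
 * helper name and surrounding context are assumptions.  The object is
 * busied while the lock is held, the lock is then dropped, and no new
 * page in the object can become busy until vm_object_unbusy() is
 * called.  Pages that were already busy must still be handled by the
 * caller.
 */
static void
example_with_object_busied(vm_object_t obj)
{

        VM_OBJECT_RLOCK(obj);
        vm_object_busy(obj);
        VM_OBJECT_RUNLOCK(obj);

        /* ... inspect the object's pages without the object lock ... */

        vm_object_unbusy(obj);
}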
However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->a.queue == PQ_ACTIVE) kvo->kvo_active++; else if (m->a.queue == PQ_INACTIVE) kvo->kvo_inactive++; } kvo->kvo_vn_fileid = 0; kvo->kvo_vn_fsid = 0; kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; kvo->kvo_type = vm_object_kvme_type(obj, &vp); if (vp != NULL) { vref(vp); } else if ((obj->flags & OBJ_ANON) != 0) { MPASS(kvo->kvo_type == KVME_TYPE_DEFAULT || kvo->kvo_type == KVME_TYPE_SWAP); kvo->kvo_me = (uintptr_t)obj; /* tmpfs objs are reported as vnodes */ kvo->kvo_backing_obj = (uintptr_t)obj->backing_object; sp = swap_pager_swapped_pages(obj); kvo->kvo_swapped = sp > UINT32_MAX ? UINT32_MAX : sp; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo->kvo_vn_fileid = va.va_fileid; kvo->kvo_vn_fsid = va.va_fsid; kvo->kvo_vn_fsid_freebsd11 = va.va_fsid; /* truncate */ } vput(vp); } strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo->kvo_path) + 1; kvo->kvo_structsize = roundup(kvo->kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); free(kvo, M_TEMP); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (map == 0) return 0; if (entry == 0) { VM_MAP_ENTRY_FOREACH(tmpe, map) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; VM_MAP_ENTRY_FOREACH(tmpe, tmpm) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if ((object->flags & OBJ_ANON) != 0) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } if (db_pager_quit) return; } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. 
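/*
 * A hedged userland sketch, not part of this file, showing how the
 * variable-length records emitted by the handler above can be walked:
 * each record begins with kvo_structsize, so the consumer advances by
 * that many bytes.  In practice libutil's kinfo_getvmobject(3) wraps
 * this same walk; error handling and retrying with a larger buffer are
 * omitted for brevity.
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/user.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kinfo_vmobject *kvo;
 *		char *buf, *end;
 *		size_t len;
 *
 *		if (sysctlbyname("vm.objects", NULL, &len, NULL, 0) != 0)
 *			return (1);
 *		if ((buf = malloc(len)) == NULL ||
 *		    sysctlbyname("vm.objects", buf, &len, NULL, 0) != 0)
 *			return (1);
 *		for (end = buf + len; buf < end; buf += kvo->kvo_structsize) {
 *			kvo = (struct kinfo_vmobject *)(void *)buf;
 *			if (kvo->kvo_structsize == 0)
 *				break;
 *			printf("%ju resident, %d refs: %s\n",
 *			    (uintmax_t)kvo->kvo_resident, kvo->kvo_ref_count,
 *			    kvo->kvo_path);
 *		}
 *		return (0);
 *	}
 */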
*/ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); if (db_pager_quit) break; } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 521fc4ce95f2..216e76359631 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1,2414 +1,2408 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); /* Pagedaemon activity rates, in subdivisions of one second. */ #define VM_LAUNDER_RATE 10 #define VM_INACT_SCAN_RATE 10 static int vm_pageout_oom_seq = 12; static int vm_pageout_update_period; static int disable_swap_pageouts; static int lowmem_period = 10; static int swapdev_enabled; static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "Panic on the given number of out-of-memory errors instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RWTUN, &vm_pageout_update_period, 0, "Maximum active LRU update period"); static int pageout_cpus_per_thread = 16; SYSCTL_INT(_vm, OID_AUTO, pageout_cpus_per_thread, CTLFLAG_RDTUN, &pageout_cpus_per_thread, 0, "Number of CPUs per pagedaemon worker thread"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0, "Low memory callback period"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); static int act_scan_laundry_weight = 3; SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN, &act_scan_laundry_weight, 0, "weight given to clean vs. 
dirty pages in active queue scans"); static u_int vm_background_launder_rate = 4096; SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, &vm_background_launder_rate, 0, "background laundering rate, in kilobytes per second"); static u_int vm_background_launder_max = 20 * 1024; SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN, &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); int vm_pageout_page_count = 32; u_long vm_page_max_user_wired; SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW, &vm_page_max_user_wired, 0, "system-wide limit to user-wired page count"); static u_int isqrt(u_int num); static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall); static void vm_pageout_laundry_worker(void *arg); struct scan_state { struct vm_batchqueue bq; struct vm_pagequeue *pq; vm_page_t marker; int maxscan; int scanned; }; static void vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq, vm_page_t marker, vm_page_t after, int maxscan) { vm_pagequeue_assert_locked(pq); KASSERT((marker->a.flags & PGA_ENQUEUED) == 0, ("marker %p already enqueued", marker)); if (after == NULL) TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q); else TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q); vm_page_aflag_set(marker, PGA_ENQUEUED); vm_batchqueue_init(&ss->bq); ss->pq = pq; ss->marker = marker; ss->maxscan = maxscan; ss->scanned = 0; vm_pagequeue_unlock(pq); } static void vm_pageout_end_scan(struct scan_state *ss) { struct vm_pagequeue *pq; pq = ss->pq; vm_pagequeue_assert_locked(pq); KASSERT((ss->marker->a.flags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q); vm_page_aflag_clear(ss->marker, PGA_ENQUEUED); pq->pq_pdpages += ss->scanned; } /* * Add a small number of queued pages to a batch queue for later processing * without the corresponding queue lock held. The caller must have enqueued a * marker page at the desired start point for the scan. Pages will be * physically dequeued if the caller so requests. Otherwise, the returned * batch may contain marker pages, and it is up to the caller to handle them. * * When processing the batch queue, vm_pageout_defer() must be used to * determine whether the page has been logically dequeued since the batch was * collected. 
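/*
 * A condensed sketch of the scan pattern these helpers (together with
 * vm_pageout_next() and vm_pageout_defer() below) are built for; the
 * function and its parameters are illustrative only.  The page queue
 * lock is held only while batches are collected, and pages whose queue
 * state changed after the batch was built are filtered out by
 * vm_pageout_defer().
 */
static void
example_scan_queue(struct vm_pagequeue *pq, vm_page_t marker, uint8_t queue)
{
        struct scan_state ss;
        vm_page_t m;

        vm_pagequeue_lock(pq);
        vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
        while ((m = vm_pageout_next(&ss, false)) != NULL) {
                /* Without dequeuing, the batch may contain marker pages. */
                if (__predict_false((m->flags & PG_MARKER) != 0))
                        continue;
                if (vm_pageout_defer(m, queue, true))
                        continue;
                /* ... examine or reclaim "m" here ... */
        }
        vm_pagequeue_lock(pq);
        vm_pageout_end_scan(&ss);
        vm_pagequeue_unlock(pq);
}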
*/ static __always_inline void vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) { struct vm_pagequeue *pq; vm_page_t m, marker, n; marker = ss->marker; pq = ss->pq; KASSERT((marker->a.flags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); vm_pagequeue_lock(pq); for (m = TAILQ_NEXT(marker, plinks.q); m != NULL && ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE; m = n, ss->scanned++) { n = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) == 0) { KASSERT((m->a.flags & PGA_ENQUEUED) != 0, ("page %p not enqueued", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in page queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in page queue", m)); } else if (dequeue) continue; (void)vm_batchqueue_insert(&ss->bq, m); if (dequeue) { TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_page_aflag_clear(m, PGA_ENQUEUED); } } TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q); if (__predict_true(m != NULL)) TAILQ_INSERT_BEFORE(m, marker, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q); if (dequeue) vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt); vm_pagequeue_unlock(pq); } /* * Return the next page to be scanned, or NULL if the scan is complete. */ static __always_inline vm_page_t vm_pageout_next(struct scan_state *ss, const bool dequeue) { if (ss->bq.bq_cnt == 0) vm_pageout_collect_batch(ss, dequeue); return (vm_batchqueue_pop(&ss->bq)); } /* * Determine whether processing of a page should be deferred and ensure that any * outstanding queue operations are processed. */ static __always_inline bool vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued) { vm_page_astate_t as; as = vm_page_astate_load(m); if (__predict_false(as.queue != queue || ((as.flags & PGA_ENQUEUED) != 0) != enqueued)) return (true); if ((as.flags & PGA_QUEUE_OP_MASK) != 0) { vm_page_pqbatch_submit(m, queue); return (true); } return (false); } /* * Scan for pages at adjacent offsets within the given page's object that are * eligible for laundering, form a cluster of these pages and the given page, * and launder that cluster. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; vm_pindex_t pindex; int ib, is, page_base, pageout_count; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); pindex = m->pindex; vm_page_assert_xbusied(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * We can cluster only if the page is not clean, busy, or held, and * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying to * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_tryxbusy(p) == 0) { ib = 0; break; } if (vm_page_wired(p)) { ib = 0; vm_page_xunbusy(p); break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; vm_page_xunbusy(p); break; } if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { vm_page_xunbusy(p); ib = 0; break; } mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * We are at an alignment boundary. Stop here, and switch * directions. Do not clear ib. 
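 * For example, with the default vm_pageout_page_count of 32 and a
 * target page at pindex 70, the reverse scan adds pages 69, 68, ...,
 * 64 and then stops, because the page just added sits at
 * pindex - (ib - 1) = 64, which is a multiple of 32; the forward scan
 * then continues from pindex 71.  The pages accumulate in the mc[]
 * window around slot vm_pageout_page_count (the target page itself),
 * and &mc[page_base] together with pageout_count is what finally gets
 * handed to vm_pageout_flush().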
*/ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { if ((p = vm_page_next(ps)) == NULL || vm_page_tryxbusy(p) == 0) break; if (vm_page_wired(p)) { vm_page_xunbusy(p); break; } vm_page_test_dirty(p); if (p->dirty == 0) { vm_page_xunbusy(p); break; } if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { vm_page_xunbusy(p); break; } mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past an alignment boundary. This catches * boundary conditions. */ if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; return (vm_pageout_flush(&mc[page_base], pageout_count, VM_PAGER_PUT_NOREUSE, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Mark the pages shared busy and verify that they're * valid and read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(vm_page_all_valid(mc[i]), ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); KASSERT((mc[i]->a.flags & PGA_WRITEABLE) == 0, ("vm_pageout_flush: writeable page %p", mc[i])); vm_page_busy_downgrade(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: /* * The page may have moved since laundering started, in * which case it should be left alone. */ if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * The page is outside the object's range. We pretend * that the page out worked and clean the page, so the * changes will be lost if the page is reclaimed by * the page daemon. */ vm_page_undirty(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If the page couldn't be paged out to swap because the * pager wasn't able to find space, place the page in * the PQ_UNSWAPPABLE holding queue. This is an * optimization that prevents the page daemon from * wasting CPU cycles on pages that cannot be reclaimed * becase no swap device is configured. * * Otherwise, reactivate the page so that it doesn't * clog the laundry and inactive queues. 
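/*
 * A hedged sketch of the caller-side contract for vm_pageout_flush()
 * above, reduced to a single page; the helper is illustrative only.
 * Real callers (the clustering code above and the vnode/swap pagers)
 * pass runs of pages.  Each page must be exclusively busied, fully
 * valid, and write-protected, and the object write lock must be held;
 * pages whose pageout remains in flight (VM_PAGER_PEND) are left
 * shared-busy with the paging-in-progress count held.
 */
static int
example_flush_one(vm_page_t m)
{
        vm_page_t ma[1];

        VM_OBJECT_ASSERT_WLOCKED(m->object);
        vm_page_assert_xbusied(m);
        if (!vm_page_try_remove_write(m))
                return (0);
        ma[0] = m;
        return (vm_pageout_flush(ma, 1, VM_PAGER_PUT_SYNC, 0, NULL, NULL));
}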
(We will try * paging it out again later.) */ if ((object->flags & OBJ_SWAP) != 0 && pageout_status[i] == VM_PAGER_FAIL) { vm_page_unswappable(mt); numpagedout++; } else vm_page_activate(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } static void vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused) { atomic_store_rel_int(&swapdev_enabled, 1); } static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused) { if (swap_pager_nswapdev() == 1) atomic_store_rel_int(&swapdev_enabled, 0); } /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_xunbusy(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); /* * Ensure that the object and vnode were not disassociated * while locks were dropped. */ if (vp->v_object != object) { error = ENOENT; goto unlock_all; } /* * While the object was unlocked, the page may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { error = ENXIO; goto unlock_all; } /* * The page may have been busied while the object lock was * released. */ if (vm_page_tryxbusy(m) == 0) { error = EBUSY; goto unlock_all; } } /* * Remove all writeable mappings, failing if the page is wired. */ if (!vm_page_try_remove_write(m)) { vm_page_xunbusy(m); error = EBUSY; goto unlock_all; } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. 
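/*
 * A condensed sketch of the revalidation performed above after the
 * object lock has been dropped to obtain the vnode lock; the helper is
 * illustrative only.  Every fact established before the drop must be
 * re-checked: the page may have been requeued, reused for a different
 * object or offset, or cleaned in the meantime.
 */
static bool
example_page_still_cleanable(vm_page_t m, vm_object_t object,
    vm_pindex_t pindex)
{

        return (vm_page_in_laundry(m) && m->object == object &&
            m->pindex == pindex && m->dirty != 0);
}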
*/ if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * Attempt to launder the specified number of pages. * * Returns the number of pages successfully laundered. */ static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { struct scan_state ss; struct vm_pagequeue *pq; vm_object_t object; vm_page_t m, marker; vm_page_astate_t new, old; int act_delta, error, numpagedout, queue, refs, starting_target; int vnodes_skipped; bool pageout_ok; object = NULL; starting_target = launder; vnodes_skipped = 0; /* * Scan the laundry queues for pages eligible to be laundered. We stop * once the target number of dirty pages have been laundered, or once * we've reached the end of the queue. A single iteration of this loop * may cause more than one page to be laundered because of clustering. * * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no * swap devices are configured. */ if (atomic_load_acq_int(&swapdev_enabled)) queue = PQ_UNSWAPPABLE; else queue = PQ_LAUNDRY; scan: marker = &vmd->vmd_markers[queue]; pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt); while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) { if (__predict_false((m->flags & PG_MARKER) != 0)) continue; /* * Don't touch a page that was removed from the queue after the * page queue lock was released. Otherwise, ensure that any * pending queue operations, such as dequeues for wired pages, * are handled. */ if (vm_pageout_defer(m, queue, true)) continue; /* * Lock the page's object. */ if (object == NULL || object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* The page is being freed by another thread. */ continue; /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); if (__predict_false(m->object != object)) { VM_OBJECT_WUNLOCK(object); object = NULL; continue; } } if (vm_page_tryxbusy(m) == 0) continue; /* * Check for wirings now that we hold the object lock and have * exclusively busied the page. If the page is mapped, it may * still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase after this check. */ if (__predict_false(vm_page_wired(m))) goto skip_page; /* * Invalid pages can be easily freed. They cannot be * mapped; vm_page_free() asserts this. */ if (vm_page_none_valid(m)) goto free_page; refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; for (old = vm_page_astate_load(m);;) { /* * Check to see if the page has been removed from the * queue since the first such check. Leave it alone if * so, discarding any references collected by * pmap_ts_referenced(). */ if (__predict_false(_vm_page_queue(old) == PQ_NONE)) goto skip_page; new = old; act_delta = refs; if ((old.flags & PGA_REFERENCED) != 0) { new.flags &= ~PGA_REFERENCED; act_delta++; } if (act_delta == 0) { ; } else if (object->ref_count != 0) { /* * Increase the activation count if the page was * referenced while in the laundry queue. This * makes it less likely that the page will be * returned prematurely to the laundry queue. 
*/ new.act_count += ACT_ADVANCE + act_delta; if (new.act_count > ACT_MAX) new.act_count = ACT_MAX; new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = PQ_ACTIVE; if (!vm_page_pqstate_commit(m, &old, new)) continue; /* * If this was a background laundering, count * activated pages towards our target. The * purpose of background laundering is to ensure * that pages are eventually cycled through the * laundry queue, and an activation is a valid * way out. */ if (!in_shortfall) launder--; VM_CNT_INC(v_reactivated); goto skip_page; } else if ((object->flags & OBJ_DEAD) == 0) { new.flags |= PGA_REQUEUE; if (!vm_page_pqstate_commit(m, &old, new)) continue; goto skip_page; } break; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) goto skip_page; } /* * Clean pages are freed, and dirty pages are paged out unless * they belong to a dead object. Requeueing dirty pages from * dead objects is pointless, as they are being paged out and * freed by the thread that destroyed the object. */ if (m->dirty == 0) { free_page: /* * Now we are guaranteed that no other threads are * manipulating the page, check for a last-second * reference. */ if (vm_pageout_defer(m, queue, true)) goto skip_page; vm_page_free(m); VM_CNT_INC(v_dfree); } else if ((object->flags & OBJ_DEAD) == 0) { if ((object->flags & OBJ_SWAP) == 0 && object->type != OBJT_DEFAULT) pageout_ok = true; else if (disable_swap_pageouts) pageout_ok = false; else pageout_ok = true; if (!pageout_ok) { vm_page_launder(m); goto skip_page; } /* * Form a cluster with adjacent, dirty pages from the * same object, and page out that entire cluster. * * The adjacent, dirty pages must also be in the * laundry. However, their mappings are not checked * for new references. Consequently, a recently * referenced page may be paged out. However, that * page will not be prematurely reclaimed. After page * out, the page will be placed in the inactive queue, * where any new references will be detected and the * page reactivated. */ error = vm_pageout_clean(m, &numpagedout); if (error == 0) { launder -= numpagedout; ss.scanned += numpagedout; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } object = NULL; } else { skip_page: vm_page_xunbusy(m); } } if (object != NULL) { VM_OBJECT_WUNLOCK(object); object = NULL; } vm_pagequeue_lock(pq); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); if (launder > 0 && queue == PQ_UNSWAPPABLE) { queue = PQ_LAUNDRY; goto scan; } /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't launder enough pages. */ if (vnodes_skipped > 0 && launder > 0) (void)speedup_syncer(); return (starting_target - launder); } /* * Compute the integer square root. */ static u_int isqrt(u_int num) { u_int bit, root, tmp; bit = num != 0 ? (1u << ((fls(num) - 1) & ~1)) : 0; root = 0; while (bit != 0) { tmp = root + bit; root >>= 1; if (num >= tmp) { num -= tmp; root += bit; } bit >>= 2; } return (root); } /* * Perform the work of the laundry thread: periodically wake up and determine * whether any pages need to be laundered. If so, determine the number of pages * that need to be laundered, and launder them. 
*/ static void vm_pageout_laundry_worker(void *arg) { struct vm_domain *vmd; struct vm_pagequeue *pq; uint64_t nclean, ndirty, nfreed; int domain, last_target, launder, shortfall, shortfall_cycle, target; bool in_shortfall; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; KASSERT(vmd->vmd_segs != 0, ("domain without segments")); shortfall = 0; in_shortfall = false; shortfall_cycle = 0; last_target = target = 0; nfreed = 0; /* * Calls to these handlers are serialized by the swap syscall lock. */ (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd, EVENTHANDLER_PRI_ANY); (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd, EVENTHANDLER_PRI_ANY); /* * The pageout laundry worker is never done, so loop forever. */ for (;;) { KASSERT(target >= 0, ("negative target %d", target)); KASSERT(shortfall_cycle >= 0, ("negative cycle %d", shortfall_cycle)); launder = 0; /* * First determine whether we need to launder pages to meet a * shortage of free pages. */ if (shortfall > 0) { in_shortfall = true; shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; target = shortfall; } else if (!in_shortfall) goto trybackground; else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) { /* * We recently entered shortfall and began laundering * pages. If we have completed that laundering run * (and we are no longer in shortfall) or we have met * our laundry target through other activity, then we * can stop laundering pages. */ in_shortfall = false; target = 0; goto trybackground; } launder = target / shortfall_cycle--; goto dolaundry; /* * There's no immediate need to launder any pages; see if we * meet the conditions to perform background laundering: * * 1. The ratio of dirty to clean inactive pages exceeds the * background laundering threshold, or * 2. we haven't yet reached the target of the current * background laundering run. * * The background laundering threshold is not a constant. * Instead, it is a slowly growing function of the number of * clean pages freed by the page daemon since the last * background laundering. Thus, as the ratio of dirty to * clean inactive pages grows, the amount of memory pressure * required to trigger laundering decreases. We ensure * that the threshold is non-zero after an inactive queue * scan, even if that scan failed to free a single clean page. */ trybackground: nclean = vmd->vmd_free_count + vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt; ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt; if (target == 0 && ndirty * isqrt(howmany(nfreed + 1, vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) { target = vmd->vmd_background_launder_target; } /* * We have a non-zero background laundering target. If we've * laundered up to our maximum without observing a page daemon * request, just stop. This is a safety belt that ensures we * don't launder an excessive amount if memory pressure is low * and the ratio of dirty to clean pages is large. Otherwise, * proceed at the background laundering rate. */ if (target > 0) { if (nfreed > 0) { nfreed = 0; last_target = target; } else if (last_target - target >= vm_background_launder_max * PAGE_SIZE / 1024) { target = 0; } launder = vm_background_launder_rate * PAGE_SIZE / 1024; launder /= VM_LAUNDER_RATE; if (launder > target) launder = target; } dolaundry: if (launder > 0) { /* * Because of I/O clustering, the number of laundered * pages could exceed "target" by the maximum size of * a cluster minus one. 
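 * As a separate note on the background laundering trigger computed
 * above: with nfreed close to zero, howmany(nfreed + 1, free_target -
 * free_min) is 1 and isqrt(1) is 1, so a background run starts once
 * the dirty (laundry) count reaches the clean (free + inactive)
 * count.  Once roughly four times (free_target - free_min) clean
 * pages have been freed since the last run, the factor becomes
 * isqrt(5) = 2 and laundering starts when the dirty count reaches
 * only half of the clean count.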
*/ target -= min(vm_pageout_launder(vmd, launder, in_shortfall), target); pause("laundp", hz / VM_LAUNDER_RATE); } /* * If we're not currently laundering pages and the page daemon * hasn't posted a new request, sleep until the page daemon * kicks us. */ vm_pagequeue_lock(pq); if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE) (void)mtx_sleep(&vmd->vmd_laundry_request, vm_pagequeue_lockptr(pq), PVM, "launds", 0); /* * If the pagedaemon has indicated that it's in shortfall, start * a shortfall laundering unless we're already in the middle of * one. This may preempt a background laundering. */ if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL && (!in_shortfall || shortfall_cycle == 0)) { shortfall = vm_laundry_target(vmd) + vmd->vmd_pageout_deficit; target = 0; } else shortfall = 0; if (target == 0) vmd->vmd_laundry_request = VM_LAUNDRY_IDLE; nfreed += vmd->vmd_clean_pages_freed; vmd->vmd_clean_pages_freed = 0; vm_pagequeue_unlock(pq); } } /* * Compute the number of pages we want to try to move from the * active queue to either the inactive or laundry queue. * * When scanning active pages during a shortage, we make clean pages * count more heavily towards the page shortage than dirty pages. * This is because dirty pages must be laundered before they can be * reused and thus have less utility when attempting to quickly * alleviate a free page shortage. However, this weighting also * causes the scan to deactivate dirty pages more aggressively, * improving the effectiveness of clustering. */ static int vm_pageout_active_target(struct vm_domain *vmd) { int shortage; shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) - (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt + vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight); shortage *= act_scan_laundry_weight; return (shortage); } /* * Scan the active queue. If there is no shortage of inactive pages, scan a * small portion of the queue in order to maintain quasi-LRU. */ static void vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) { struct scan_state ss; vm_object_t object; vm_page_t m, marker; struct vm_pagequeue *pq; vm_page_astate_t old, new; long min_scan; int act_delta, max_scan, ps_delta, refs, scan_tick; uint8_t nqueue; marker = &vmd->vmd_markers[PQ_ACTIVE]; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. * * To avoid requeuing each page that remains in the active queue, we * implement the CLOCK algorithm. To keep the implementation of the * enqueue operation consistent for all page queues, we use two hands, * represented by marker pages. Scans begin at the first hand, which * precedes the second hand in the queue. When the two hands meet, * they are moved back to the head and tail of the queue, respectively, * and scanning resumes. */ max_scan = page_shortage > 0 ? 
pq->pq_cnt : min_scan; act_scan: vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan); while ((m = vm_pageout_next(&ss, false)) != NULL) { if (__predict_false(m == &vmd->vmd_clock[1])) { vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q); TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q); max_scan -= ss.scanned; vm_pageout_end_scan(&ss); goto act_scan; } if (__predict_false((m->flags & PG_MARKER) != 0)) continue; /* * Don't touch a page that was removed from the queue after the * page queue lock was released. Otherwise, ensure that any * pending queue operations, such as dequeues for wired pages, * are handled. */ if (vm_pageout_defer(m, PQ_ACTIVE, true)) continue; /* * A page's object pointer may be set to NULL before * the object lock is acquired. */ object = atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* * The page has been removed from its object. */ continue; /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0 && VM_OBJECT_TRYWLOCK(object)) { if (m->object == object) vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(object); } /* * Check to see "how much" the page has been used. * * Test PGA_REFERENCED after calling pmap_ts_referenced() so * that a reference from a concurrently destroyed mapping is * observed here and now. * * Perform an unsynchronized object ref count check. While * the page lock ensures that the page is not reallocated to * another object, in particular, one with unmanaged mappings * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: * 1) The count was transitioning to zero, but we saw a non- * zero value. pmap_ts_referenced() will return zero * because the page is not mapped. * 2) The count was transitioning to one, but we saw zero. * This race delays the detection of a new reference. At * worst, we will deactivate and reactivate the page. */ refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; old = vm_page_astate_load(m); do { /* * Check to see if the page has been removed from the * queue since the first such check. Leave it alone if * so, discarding any references collected by * pmap_ts_referenced(). */ if (__predict_false(_vm_page_queue(old) == PQ_NONE)) { ps_delta = 0; break; } /* * Advance or decay the act_count based on recent usage. */ new = old; act_delta = refs; if ((old.flags & PGA_REFERENCED) != 0) { new.flags &= ~PGA_REFERENCED; act_delta++; } if (act_delta != 0) { new.act_count += ACT_ADVANCE + act_delta; if (new.act_count > ACT_MAX) new.act_count = ACT_MAX; } else { new.act_count -= min(new.act_count, ACT_DECLINE); } if (new.act_count > 0) { /* * Adjust the activation count and keep the page * in the active queue. The count might be left * unchanged if it is saturated. The page may * have been moved to a different queue since we * started the scan, in which case we move it * back. */ ps_delta = 0; if (old.queue != PQ_ACTIVE) { new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = PQ_ACTIVE; } } else { /* * When not short for inactive pages, let dirty * pages go through the inactive queue before * moving to the laundry queue. This gives them * some extra time to be reactivated, * potentially avoiding an expensive pageout. * However, during a page shortage, the inactive * queue is necessarily small, and so dirty * pages would only spend a trivial amount of * time in the inactive queue. 
Therefore, we * might as well place them directly in the * laundry queue to reduce queuing overhead. * * Calling vm_page_test_dirty() here would * require acquisition of the object's write * lock. However, during a page shortage, * directing dirty pages into the laundry queue * is only an optimization and not a * requirement. Therefore, we simply rely on * the opportunistic updates to the page's dirty * field by the pmap. */ if (page_shortage <= 0) { nqueue = PQ_INACTIVE; ps_delta = 0; } else if (m->dirty == 0) { nqueue = PQ_INACTIVE; ps_delta = act_scan_laundry_weight; } else { nqueue = PQ_LAUNDRY; ps_delta = 1; } new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); page_shortage -= ps_delta; } vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); } static int vm_pageout_reinsert_inactive_page(struct vm_pagequeue *pq, vm_page_t marker, vm_page_t m) { vm_page_astate_t as; vm_pagequeue_assert_locked(pq); as = vm_page_astate_load(m); if (as.queue != PQ_INACTIVE || (as.flags & PGA_ENQUEUED) != 0) return (0); vm_page_aflag_set(m, PGA_ENQUEUED); TAILQ_INSERT_BEFORE(marker, m, plinks.q); return (1); } /* * Re-add stuck pages to the inactive queue. We will examine them again * during the next scan. If the queue state of a page has changed since * it was physically removed from the page queue in * vm_pageout_collect_batch(), don't do anything with that page. */ static void vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq, vm_page_t m) { struct vm_pagequeue *pq; vm_page_t marker; int delta; delta = 0; marker = ss->marker; pq = ss->pq; if (m != NULL) { if (vm_batchqueue_insert(bq, m)) return; vm_pagequeue_lock(pq); delta += vm_pageout_reinsert_inactive_page(pq, marker, m); } else vm_pagequeue_lock(pq); while ((m = vm_batchqueue_pop(bq)) != NULL) delta += vm_pageout_reinsert_inactive_page(pq, marker, m); vm_pagequeue_cnt_add(pq, delta); vm_pagequeue_unlock(pq); vm_batchqueue_init(bq); } static void vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage) { struct timeval start, end; struct scan_state ss; struct vm_batchqueue rq; struct vm_page marker_page; vm_page_t m, marker; struct vm_pagequeue *pq; vm_object_t object; vm_page_astate_t old, new; int act_delta, addl_page_shortage, starting_page_shortage, refs; object = NULL; vm_batchqueue_init(&rq); getmicrouptime(&start); /* * The addl_page_shortage is an estimate of the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->a.act_count is not used to make * decisions for the inactive queue, only for the active queue.) 
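/*
 * A condensed mirror of the ps_delta accounting used in the active
 * queue scan above; the helper is illustrative only.  With the default
 * act_scan_laundry_weight of 3, deactivating a clean page counts three
 * times as much toward the active scan target as moving a dirty page
 * to the laundry queue, matching the weighting applied in
 * vm_pageout_active_target().
 */
static int
example_deactivation_credit(vm_page_t m, int page_shortage)
{

        if (page_shortage <= 0)
                return (0);
        return (m->dirty == 0 ? act_scan_laundry_weight : 1);
}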
*/ starting_page_shortage = page_shortage; marker = &marker_page; vm_page_init_marker(marker, PQ_INACTIVE, 0); pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; vm_pagequeue_lock(pq); vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt); while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) { KASSERT((m->flags & PG_MARKER) == 0, ("marker page %p was dequeued", m)); /* * Don't touch a page that was removed from the queue after the * page queue lock was released. Otherwise, ensure that any * pending queue operations, such as dequeues for wired pages, * are handled. */ if (vm_pageout_defer(m, PQ_INACTIVE, false)) continue; /* * Lock the page's object. */ if (object == NULL || object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* The page is being freed by another thread. */ continue; /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); if (__predict_false(m->object != object)) { VM_OBJECT_WUNLOCK(object); object = NULL; goto reinsert; } } if (vm_page_tryxbusy(m) == 0) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; goto reinsert; } /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0) vm_pager_page_unswapped(m); /* * Check for wirings now that we hold the object lock and have * exclusively busied the page. If the page is mapped, it may * still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase after this check. */ if (__predict_false(vm_page_wired(m))) goto skip_page; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (vm_page_none_valid(m)) goto free_page; refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; for (old = vm_page_astate_load(m);;) { /* * Check to see if the page has been removed from the * queue since the first such check. Leave it alone if * so, discarding any references collected by * pmap_ts_referenced(). */ if (__predict_false(_vm_page_queue(old) == PQ_NONE)) goto skip_page; new = old; act_delta = refs; if ((old.flags & PGA_REFERENCED) != 0) { new.flags &= ~PGA_REFERENCED; act_delta++; } if (act_delta == 0) { ; } else if (object->ref_count != 0) { /* * Increase the activation count if the * page was referenced while in the * inactive queue. This makes it less * likely that the page will be returned * prematurely to the inactive queue. */ new.act_count += ACT_ADVANCE + act_delta; if (new.act_count > ACT_MAX) new.act_count = ACT_MAX; new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = PQ_ACTIVE; if (!vm_page_pqstate_commit(m, &old, new)) continue; VM_CNT_INC(v_reactivated); goto skip_page; } else if ((object->flags & OBJ_DEAD) == 0) { new.queue = PQ_INACTIVE; new.flags |= PGA_REQUEUE; if (!vm_page_pqstate_commit(m, &old, new)) continue; goto skip_page; } break; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. 
*/ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) goto skip_page; } /* * Clean pages can be freed, but dirty pages must be sent back * to the laundry, unless they belong to a dead object. * Requeueing dirty pages from dead objects is pointless, as * they are being paged out and freed by the thread that * destroyed the object. */ if (m->dirty == 0) { free_page: /* * Now we are guaranteed that no other threads are * manipulating the page, check for a last-second * reference that would save it from doom. */ if (vm_pageout_defer(m, PQ_INACTIVE, false)) goto skip_page; /* * Because we dequeued the page and have already checked * for pending dequeue and enqueue requests, we can * safely disassociate the page from the inactive queue * without holding the queue lock. */ m->a.queue = PQ_NONE; vm_page_free(m); page_shortage--; continue; } if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); skip_page: vm_page_xunbusy(m); continue; reinsert: vm_pageout_reinsert_inactive(&ss, &rq, m); } if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_pageout_reinsert_inactive(&ss, &rq, NULL); vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL); vm_pagequeue_lock(pq); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); /* * Record the remaining shortage and the progress and rate it was made. */ atomic_add_int(&vmd->vmd_addl_shortage, addl_page_shortage); getmicrouptime(&end); timevalsub(&end, &start); atomic_add_int(&vmd->vmd_inactive_us, end.tv_sec * 1000000 + end.tv_usec); atomic_add_int(&vmd->vmd_inactive_freed, starting_page_shortage - page_shortage); } /* * Dispatch a number of inactive threads according to load and collect the * results to present a coherent view of paging activity on this domain. */ static int vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage) { u_int freed, pps, slop, threads, us; vmd->vmd_inactive_shortage = shortage; slop = 0; /* * If we have more work than we can do in a quarter of our interval, we * fire off multiple threads to process it. */ threads = vmd->vmd_inactive_threads; if (threads > 1 && vmd->vmd_inactive_pps != 0 && shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) { vmd->vmd_inactive_shortage /= threads; slop = shortage % threads; vm_domain_pageout_lock(vmd); blockcount_acquire(&vmd->vmd_inactive_starting, threads - 1); blockcount_acquire(&vmd->vmd_inactive_running, threads - 1); wakeup(&vmd->vmd_inactive_shortage); vm_domain_pageout_unlock(vmd); } /* Run the local thread scan. */ vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage + slop); /* * Block until helper threads report results and then accumulate * totals. */ blockcount_wait(&vmd->vmd_inactive_running, NULL, "vmpoid", PVM); freed = atomic_readandclear_int(&vmd->vmd_inactive_freed); VM_CNT_ADD(v_dfree, freed); /* * Calculate the per-thread paging rate with an exponential decay of * prior results. Careful to avoid integer rounding errors with large * us values. */ us = max(atomic_readandclear_int(&vmd->vmd_inactive_us), 1); if (us > 1000000) /* Keep rounding to tenths */ pps = (freed * 10) / ((us * 10) / 1000000); else pps = (1000000 / us) * freed; vmd->vmd_inactive_pps = (vmd->vmd_inactive_pps / 2) + (pps / 2); return (shortage - freed); } /* * Attempt to reclaim the requested number of pages from the inactive queue. * Returns true if the shortage was addressed. 
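 * As a worked example of the dispatch arithmetic in
 * vm_pageout_inactive_dispatch() above: with vmd_inactive_threads = 4
 * and a shortage of 4003 pages, helper threads are woken only if the
 * shortage exceeds vmd_inactive_pps / VM_INACT_SCAN_RATE / 4 (500 with
 * a rate estimate of 20000 pages/s); each thread is then given a
 * 1000-page target and the 3-page remainder ("slop") is added to the
 * local thread's scan.  The rate estimate itself decays exponentially:
 * a previous estimate of 20000 combined with a measured 10000 pages/s
 * yields 20000 / 2 + 10000 / 2 = 15000.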
 */
static int
vm_pageout_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage)
{
	struct vm_pagequeue *pq;
	u_int addl_page_shortage, deficit, page_shortage;
	u_int starting_page_shortage;

	/*
	 * vmd_pageout_deficit counts the number of pages requested in
	 * allocations that failed because of a free page shortage. We assume
	 * that the allocations will be reattempted and thus include the
	 * deficit in our scan target.
	 */
	deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
	starting_page_shortage = shortage + deficit;

	/*
	 * Run the inactive scan on as many threads as is necessary.
	 */
	page_shortage = vm_pageout_inactive_dispatch(vmd,
	    starting_page_shortage);
	addl_page_shortage = atomic_readandclear_int(&vmd->vmd_addl_shortage);

	/*
	 * Wake up the laundry thread so that it can perform any needed
	 * laundering. If we didn't meet our target, we're in shortfall and
	 * need to launder more aggressively. If PQ_LAUNDRY is empty and no
	 * swap devices are configured, the laundry thread has no work to do,
	 * so don't bother waking it up.
	 *
	 * The laundry thread uses the number of inactive queue scans elapsed
	 * since the last laundering to determine whether to launder again, so
	 * keep count.
	 */
	if (starting_page_shortage > 0) {
		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
		vm_pagequeue_lock(pq);
		if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
		    (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
			if (page_shortage > 0) {
				vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
				VM_CNT_INC(v_pdshortfalls);
			} else if (vmd->vmd_laundry_request !=
			    VM_LAUNDRY_SHORTFALL)
				vmd->vmd_laundry_request =
				    VM_LAUNDRY_BACKGROUND;
			wakeup(&vmd->vmd_laundry_request);
		}
		vmd->vmd_clean_pages_freed +=
		    starting_page_shortage - page_shortage;
		vm_pagequeue_unlock(pq);
	}

	/*
	 * Wake up the swapout daemon if we didn't free the targeted number of
	 * pages.
	 */
	if (page_shortage > 0)
		vm_swapout_run();

	/*
	 * If the inactive queue scan fails repeatedly to meet its
	 * target, kill the largest process.
	 */
	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);

	/*
	 * Reclaim pages by swapping out idle processes, if configured to do
	 * so.
	 */
	vm_swapout_run_idle();

	/*
	 * See the description of addl_page_shortage above.
	 */
	*addl_shortage = addl_page_shortage + deficit;

	return (page_shortage <= 0);
}

static int vm_pageout_oom_vote;

/*
 * The pagedaemon threads randomly select one to perform the OOM.
 * Trying to kill processes before all pagedaemons have failed to reach
 * the free target is premature.
 */
static void
vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
    int starting_page_shortage)
{
	int old_vote;

	if (starting_page_shortage <= 0 || starting_page_shortage !=
	    page_shortage)
		vmd->vmd_oom_seq = 0;
	else
		vmd->vmd_oom_seq++;
	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
		if (vmd->vmd_oom) {
			vmd->vmd_oom = FALSE;
			atomic_subtract_int(&vm_pageout_oom_vote, 1);
		}
		return;
	}

	/*
	 * Do not follow the call sequence until OOM condition is
	 * cleared.
	 */
	vmd->vmd_oom_seq = 0;

	if (vmd->vmd_oom)
		return;

	vmd->vmd_oom = TRUE;
	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
	if (old_vote != vm_ndomains - 1)
		return;

	/*
	 * The current pagedaemon thread is the last in the quorum to
	 * start OOM. Initiate the selection and signaling of the
	 * victim.
	 */
	vm_pageout_oom(VM_OOM_MEM);

	/*
	 * After one round of OOM terror, recall our vote. On the
	 * next pass, the current pagedaemon will vote again if the low
	 * memory condition is still there, due to vmd_oom being
	 * false.
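	 *
	 * Illustrative worked example (not part of the original comment):
	 * with vm_ndomains == 4, three domains may have voted without
	 * consequence; only when the fourth domain's atomic_fetchadd_int()
	 * above returns vm_ndomains - 1 == 3 is vm_pageout_oom() actually
	 * invoked.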
	 */
	vmd->vmd_oom = FALSE;
	atomic_subtract_int(&vm_pageout_oom_vote, 1);
}

/*
 * The OOM killer is the page daemon's action of last resort when
 * memory allocation requests have been stalled for a prolonged period
 * of time because it cannot reclaim memory. This function computes
 * the approximate number of physical pages that could be reclaimed if
 * the specified address space is destroyed.
 *
 * Private, anonymous memory owned by the address space is the
 * principal resource that we expect to recover after an OOM kill.
 * Since the physical pages mapped by the address space's COW entries
 * are typically shared pages, they are unlikely to be released and so
 * they are not counted.
 *
 * To get to the point where the page daemon runs the OOM killer, its
 * efforts to write-back vnode-backed pages may have stalled. This
 * could be caused by a memory allocation deadlock in the write path
 * that might be resolved by an OOM kill. Therefore, physical pages
 * belonging to vnode-backed objects are counted, because they might
 * be freed without being written out first if the address space holds
 * the last reference to an unlinked vnode.
 *
 * Similarly, physical pages belonging to OBJT_PHYS objects are
 * counted because the address space might hold the last reference to
 * the object.
 */
static long
vm_pageout_oom_pagecount(struct vmspace *vmspace)
{
	vm_map_t map;
	vm_map_entry_t entry;
	vm_object_t obj;
	long res;

	map = &vmspace->vm_map;
	KASSERT(!map->system_map, ("system map"));
	sx_assert(&map->lock, SA_LOCKED);
	res = 0;
	VM_MAP_ENTRY_FOREACH(entry, map) {
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
			continue;
		obj = entry->object.vm_object;
		if (obj == NULL)
			continue;
		if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
		    obj->ref_count != 1)
			continue;
-		switch (obj->type) {
-		case OBJT_DEFAULT:
-		case OBJT_SWAP:
-		case OBJT_SWAP_TMPFS:
-		case OBJT_PHYS:
-		case OBJT_VNODE:
+		if (obj->type == OBJT_DEFAULT || obj->type == OBJT_PHYS ||
+		    obj->type == OBJT_VNODE || (obj->flags & OBJ_SWAP) != 0)
			res += obj->resident_page_count;
-			break;
-		}
	}
	return (res);
}

static int vm_oom_ratelim_last;
static int vm_oom_pf_secs = 10;
SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
    "");
static struct mtx vm_oom_ratelim_mtx;

void
vm_pageout_oom(int shortage)
{
	struct proc *p, *bigproc;
	vm_offset_t size, bigsize;
	struct thread *td;
	struct vmspace *vm;
	int now;
	bool breakout;

	/*
	 * For OOM requests originating from vm_fault(), there is a high
	 * chance that a single large process faults simultaneously in
	 * several threads. Also, on an active system running many
	 * processes of middle-size, like buildworld, all of them
	 * could fault almost simultaneously as well.
	 *
	 * To avoid killing too many processes, rate-limit OOMs
	 * initiated by vm_fault() time-outs on the waits for free
	 * pages.
	 */
	mtx_lock(&vm_oom_ratelim_mtx);
	now = ticks;
	if (shortage == VM_OOM_MEM_PF &&
	    (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) {
		mtx_unlock(&vm_oom_ratelim_mtx);
		return;
	}
	vm_oom_ratelim_last = now;
	mtx_unlock(&vm_oom_ratelim_mtx);

	/*
	 * We keep the process bigproc locked once we find it to keep anyone
	 * from messing with it; however, there is a possibility of
	 * deadlock if process B is bigproc and one of its child processes
	 * attempts to propagate a signal to B while we are waiting for A's
	 * lock while walking this list. To avoid this, we don't block on
	 * the process lock but just skip a process if it is already locked.
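	 *
	 * Illustrative note on the selection metric used below (not part of
	 * the original comment): a candidate's size is vmspace_swap_count(vm)
	 * plus, for VM_OOM_MEM and VM_OOM_MEM_PF requests,
	 * vm_pageout_oom_pagecount(vm), so a process with little swap usage
	 * but a large anonymous resident set can still be selected as
	 * bigproc.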
	 */
	bigproc = NULL;
	bigsize = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);

		/*
		 * If this is a system, protected or killed process, skip it.
		 */
		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
		    p->p_pid == 1 || P_KILLED(p) ||
		    (p->p_pid < 48 && swap_pager_avail != 0)) {
			PROC_UNLOCK(p);
			continue;
		}

		/*
		 * If the process is in a non-running type state,
		 * don't touch it. Check all the threads individually.
		 */
		breakout = false;
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (!TD_ON_RUNQ(td) &&
			    !TD_IS_RUNNING(td) &&
			    !TD_IS_SLEEPING(td) &&
			    !TD_IS_SUSPENDED(td) &&
			    !TD_IS_SWAPPED(td)) {
				thread_unlock(td);
				breakout = true;
				break;
			}
			thread_unlock(td);
		}
		if (breakout) {
			PROC_UNLOCK(p);
			continue;
		}

		/*
		 * get the process size
		 */
		vm = vmspace_acquire_ref(p);
		if (vm == NULL) {
			PROC_UNLOCK(p);
			continue;
		}
		_PHOLD_LITE(p);
		PROC_UNLOCK(p);
		sx_sunlock(&allproc_lock);
		if (!vm_map_trylock_read(&vm->vm_map)) {
			vmspace_free(vm);
			sx_slock(&allproc_lock);
			PRELE(p);
			continue;
		}
		size = vmspace_swap_count(vm);
		if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
			size += vm_pageout_oom_pagecount(vm);
		vm_map_unlock_read(&vm->vm_map);
		vmspace_free(vm);
		sx_slock(&allproc_lock);

		/*
		 * If this process is bigger than the biggest one,
		 * remember it.
		 */
		if (size > bigsize) {
			if (bigproc != NULL)
				PRELE(bigproc);
			bigproc = p;
			bigsize = size;
		} else {
			PRELE(p);
		}
	}
	sx_sunlock(&allproc_lock);

	if (bigproc != NULL) {
		if (vm_panic_on_oom != 0 && --vm_panic_on_oom == 0)
			panic("out of swap space");
		PROC_LOCK(bigproc);
		killproc(bigproc, "out of swap space");
		sched_nice(bigproc, PRIO_MIN);
		_PRELE(bigproc);
		PROC_UNLOCK(bigproc);
	}
}

/*
 * Signal a free page shortage to subsystems that have registered an event
 * handler. Reclaim memory from UMA in the event of a severe shortage.
 * Return true if the free page count should be re-evaluated.
 */
static bool
vm_pageout_lowmem(void)
{
	static int lowmem_ticks = 0;
	int last;
	bool ret;

	ret = false;
	last = atomic_load_int(&lowmem_ticks);
	while ((u_int)(ticks - last) / hz >= lowmem_period) {
		if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
			continue;

		/*
		 * Decrease registered cache sizes.
		 */
		SDT_PROBE0(vm, , , vm__lowmem_scan);
		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);

		/*
		 * We do this explicitly after the caches have been
		 * drained above.
		 */
		uma_reclaim(UMA_RECLAIM_TRIM);
		ret = true;
		break;
	}

	/*
	 * Kick off an asynchronous reclaim of cached memory if one of the
	 * page daemons is failing to keep up with demand. Use the "severe"
	 * threshold instead of "min" to ensure that we do not blow away the
	 * caches if a subset of the NUMA domains are depleted by kernel
	 * memory allocations; the domainset iterators automatically skip
	 * domains below the "min" threshold on the first pass.
	 *
	 * UMA reclaim worker has its own rate-limiting mechanism, so don't
	 * worry about kicking it too often.
	 */
	if (vm_page_count_severe())
		uma_reclaim_wakeup();

	return (ret);
}

static void
vm_pageout_worker(void *arg)
{
	struct vm_domain *vmd;
	u_int ofree;
	int addl_shortage, domain, shortage;
	bool target_met;

	domain = (uintptr_t)arg;
	vmd = VM_DOMAIN(domain);
	shortage = 0;
	target_met = true;

	/*
	 * XXXKIB It could be useful to bind pageout daemon threads to
	 * the cores belonging to the domain, from which vm_page_array
	 * is allocated.
	 */

	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
	vmd->vmd_last_active_scan = ticks;

	/*
	 * The pageout daemon worker is never done, so loop forever.
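	 *
	 * Illustrative example (not from the original source, and assuming
	 * the stock VM_INACT_SCAN_RATE of 10): each iteration below either
	 * sleeps in mtx_sleep() for up to hz / 10 ticks (roughly 100 ms) or,
	 * after a scan that missed its target, pauses for the same interval
	 * so the laundry thread can make progress.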
	 */
	while (TRUE) {
		vm_domain_pageout_lock(vmd);

		/*
		 * We need to clear wanted before we check the limits. This
		 * prevents races with wakers who will check wanted after they
		 * reach the limit.
		 */
		atomic_store_int(&vmd->vmd_pageout_wanted, 0);

		/*
		 * Might the page daemon need to run again?
		 */
		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
			/*
			 * Yes. If the scan failed to produce enough free
			 * pages, sleep uninterruptibly for some time in the
			 * hope that the laundry thread will clean some pages.
			 */
			vm_domain_pageout_unlock(vmd);
			if (!target_met)
				pause("pwait", hz / VM_INACT_SCAN_RATE);
		} else {
			/*
			 * No, sleep until the next wakeup or until pages
			 * need to have their reference stats updated.
			 */
			if (mtx_sleep(&vmd->vmd_pageout_wanted,
			    vm_domain_pageout_lockptr(vmd), PDROP | PVM,
			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
				VM_CNT_INC(v_pdwakeups);
		}

		/* Prevent spurious wakeups by ensuring that wanted is set. */
		atomic_store_int(&vmd->vmd_pageout_wanted, 1);

		/*
		 * Use the controller to calculate how many pages to free in
		 * this interval, and scan the inactive queue. If the lowmem
		 * handlers appear to have freed up some pages, subtract the
		 * difference from the inactive queue scan target.
		 */
		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
		if (shortage > 0) {
			ofree = vmd->vmd_free_count;
			if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
				shortage -= min(vmd->vmd_free_count - ofree,
				    (u_int)shortage);
			target_met = vm_pageout_inactive(vmd, shortage,
			    &addl_shortage);
		} else
			addl_shortage = 0;

		/*
		 * Scan the active queue. A positive value for shortage
		 * indicates that we must aggressively deactivate pages to
		 * avoid a shortfall.
		 */
		shortage = vm_pageout_active_target(vmd) + addl_shortage;
		vm_pageout_scan_active(vmd, shortage);
	}
}

/*
 * vm_pageout_helper runs additional pageout daemons in times of high paging
 * activity.
 */
static void
vm_pageout_helper(void *arg)
{
	struct vm_domain *vmd;
	int domain;

	domain = (uintptr_t)arg;
	vmd = VM_DOMAIN(domain);

	vm_domain_pageout_lock(vmd);
	for (;;) {
		msleep(&vmd->vmd_inactive_shortage,
		    vm_domain_pageout_lockptr(vmd), PVM, "psleep", 0);
		blockcount_release(&vmd->vmd_inactive_starting, 1);

		vm_domain_pageout_unlock(vmd);
		vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
		vm_domain_pageout_lock(vmd);

		/*
		 * Release the running count while the pageout lock is held to
		 * prevent wakeup races.
		 */
		blockcount_release(&vmd->vmd_inactive_running, 1);
	}
}

static int
get_pageout_threads_per_domain(const struct vm_domain *vmd)
{
	unsigned total_pageout_threads, eligible_cpus, domain_cpus;

	if (VM_DOMAIN_EMPTY(vmd->vmd_domain))
		return (0);

	/*
	 * Semi-arbitrarily constrain pagedaemon threads to less than half the
	 * total number of CPUs in the system as an upper limit.
	 */
	if (pageout_cpus_per_thread < 2)
		pageout_cpus_per_thread = 2;
	else if (pageout_cpus_per_thread > mp_ncpus)
		pageout_cpus_per_thread = mp_ncpus;

	total_pageout_threads = howmany(mp_ncpus, pageout_cpus_per_thread);
	domain_cpus = CPU_COUNT(&cpuset_domain[vmd->vmd_domain]);

	/* Pagedaemons are not run in empty domains. */
	eligible_cpus = mp_ncpus;
	for (unsigned i = 0; i < vm_ndomains; i++)
		if (VM_DOMAIN_EMPTY(i))
			eligible_cpus -= CPU_COUNT(&cpuset_domain[i]);

	/*
	 * Assign a portion of the total pageout threads to this domain
	 * corresponding to the fraction of pagedaemon-eligible CPUs in the
	 * domain. In asymmetric NUMA systems, domains with more CPUs may be
	 * allocated more threads than domains with fewer CPUs.
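	 *
	 * Illustrative worked example (not part of the original comment):
	 * with mp_ncpus == 64, pageout_cpus_per_thread == 16 and no empty
	 * domains, total_pageout_threads is howmany(64, 16) == 4; a domain
	 * owning 48 of the 64 eligible CPUs gets howmany(4 * 48, 64) == 3
	 * threads, while a 16-CPU domain gets 1.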
	 */
	return (howmany(total_pageout_threads * domain_cpus, eligible_cpus));
}

/*
 * Initialize basic pageout daemon settings. See the comment above the
 * definition of vm_domain for some explanation of how these thresholds are
 * used.
 */
static void
vm_pageout_init_domain(int domain)
{
	struct vm_domain *vmd;
	struct sysctl_oid *oid;

	vmd = VM_DOMAIN(domain);
	vmd->vmd_interrupt_free_min = 2;

	/*
	 * v_free_reserved needs to include enough for the largest
	 * swap pager structures plus enough for any pv_entry structs
	 * when paging.
	 */
	vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE +
	    vmd->vmd_interrupt_free_min;
	vmd->vmd_free_reserved = vm_pageout_page_count +
	    vmd->vmd_pageout_free_min + vmd->vmd_page_count / 768;
	vmd->vmd_free_min = vmd->vmd_page_count / 200;
	vmd->vmd_free_severe = vmd->vmd_free_min / 2;
	vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
	vmd->vmd_free_min += vmd->vmd_free_reserved;
	vmd->vmd_free_severe += vmd->vmd_free_reserved;
	vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
	if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;

	/*
	 * Set the default wakeup threshold to be 10% below the paging
	 * target. This keeps the steady state out of shortfall.
	 */
	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;

	/*
	 * Target amount of memory to move out of the laundry queue during a
	 * background laundering. This is proportional to the amount of
	 * system memory.
	 */
	vmd->vmd_background_launder_target = (vmd->vmd_free_target -
	    vmd->vmd_free_min) / 10;

	/* Initialize the pageout daemon pid controller. */
	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
	    vmd->vmd_free_target, PIDCTRL_BOUND,
	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
	    "pidctrl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));

	vmd->vmd_inactive_threads = get_pageout_threads_per_domain(vmd);
}

static void
vm_pageout_init(void)
{
	u_long freecount;
	int i;

	/*
	 * Initialize some paging parameters.
	 */
	if (vm_cnt.v_page_count < 2000)
		vm_pageout_page_count = 8;

	freecount = 0;
	for (i = 0; i < vm_ndomains; i++) {
		struct vm_domain *vmd;

		vm_pageout_init_domain(i);
		vmd = VM_DOMAIN(i);
		vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
		vm_cnt.v_free_target += vmd->vmd_free_target;
		vm_cnt.v_free_min += vmd->vmd_free_min;
		vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
		vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
		vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
		vm_cnt.v_free_severe += vmd->vmd_free_severe;
		freecount += vmd->vmd_free_count;
	}

	/*
	 * Set interval in seconds for active scan. We want to visit each
	 * page at least once every ten minutes. This is to prevent worst
	 * case paging behaviors with stale active LRU.
	 */
	if (vm_pageout_update_period == 0)
		vm_pageout_update_period = 600;

	/*
	 * Set the maximum number of user-wired virtual pages. Historically
	 * the main source of such pages was mlock(2) and mlockall(2).
	 * Hypervisors may also request user-wired memory.
	 */
	if (vm_page_max_user_wired == 0)
		vm_page_max_user_wired = 4 * freecount / 5;
}

/*
 * vm_pageout is the high level pageout daemon.
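 *
 * Illustrative summary of the thread layout created below (not part of the
 * original comment): the bootstrap pagedaemon thread becomes the worker for
 * the first non-empty domain (renamed "dom%d"), each additional non-empty
 * domain gets its own worker via kthread_add(), every domain also gets
 * vmd_inactive_threads - 1 inactive-scan helpers and one laundry thread, and
 * a single "uma" thread runs uma_reclaim_worker().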
 */
static void
vm_pageout(void)
{
	struct proc *p;
	struct thread *td;
	int error, first, i, j, pageout_threads;

	p = curproc;
	td = curthread;

	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
	swap_pager_swap_init();
	for (first = -1, i = 0; i < vm_ndomains; i++) {
		if (VM_DOMAIN_EMPTY(i)) {
			if (bootverbose)
				printf("domain %d empty; skipping pageout\n",
				    i);
			continue;
		}
		if (first == -1)
			first = i;
		else {
			error = kthread_add(vm_pageout_worker,
			    (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
			if (error != 0)
				panic("starting pageout for domain %d: %d\n",
				    i, error);
		}
		pageout_threads = VM_DOMAIN(i)->vmd_inactive_threads;
		for (j = 0; j < pageout_threads - 1; j++) {
			error = kthread_add(vm_pageout_helper,
			    (void *)(uintptr_t)i, p, NULL, 0, 0,
			    "dom%d helper%d", i, j);
			if (error != 0)
				panic("starting pageout helper %d for domain "
				    "%d: %d\n", j, i, error);
		}
		error = kthread_add(vm_pageout_laundry_worker,
		    (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
		if (error != 0)
			panic("starting laundry for domain %d: %d", i, error);
	}
	error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
	if (error != 0)
		panic("starting uma_reclaim helper, error %d\n", error);

	snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
	vm_pageout_worker((void *)(uintptr_t)first);
}

/*
 * Perform an advisory wakeup of the page daemon.
 */
void
pagedaemon_wakeup(int domain)
{
	struct vm_domain *vmd;

	vmd = VM_DOMAIN(domain);
	vm_domain_pageout_assert_unlocked(vmd);
	if (curproc == pageproc)
		return;

	if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
		vm_domain_pageout_lock(vmd);
		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
		wakeup(&vmd->vmd_pageout_wanted);
		vm_domain_pageout_unlock(vmd);
	}
}
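
/*
 * Illustrative note on the pagedaemon_wakeup() protocol above (not part of
 * this change): the worker clears vmd_pageout_wanted before checking the
 * free-page limits, and wakers increment it with atomic_fetchadd_int(), so
 * only the first caller after the daemon clears the flag observes zero and
 * issues the wakeup(); subsequent callers merely leave the flag set, and a
 * burst of allocation failures costs one wakeup rather than one per caller.
 */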