Index: head/sys/vm/vm_contig.c =================================================================== --- head/sys/vm/vm_contig.c (revision 121225) +++ head/sys/vm/vm_contig.c (revision 121226) @@ -1,317 +1,324 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vm_contig_launder(int queue) { vm_object_t object; vm_page_t m, m_tmp, next; + struct vnode *vp; for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) { next = TAILQ_NEXT(m, pageq); KASSERT(m->queue == queue, ("vm_contig_launder: page %p's queue is not %d", m, queue)); + if (!VM_OBJECT_TRYLOCK(m->object)) + continue; if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) { + VM_OBJECT_UNLOCK(m->object); vm_page_lock_queues(); return (TRUE); } vm_page_test_dirty(m); if (m->dirty) { object = m->object; if (object->type == OBJT_VNODE) { vm_page_unlock_queues(); - vn_lock(object->handle, - LK_EXCLUSIVE | LK_RETRY, curthread); + vp = object->handle; + VM_OBJECT_UNLOCK(object); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); VM_OBJECT_LOCK(object); vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); - VOP_UNLOCK(object->handle, 0, curthread); + VOP_UNLOCK(vp, 0, curthread); vm_page_lock_queues(); return (TRUE); } else if (object->type == OBJT_SWAP || object->type == OBJT_DEFAULT) { m_tmp = m; - vm_pageout_flush(&m_tmp, 1, 0, FALSE); + vm_pageout_flush(&m_tmp, 1, 0); + VM_OBJECT_UNLOCK(object); return (TRUE); } } else if (m->busy == 0 && m->hold_count == 0) vm_page_cache(m); + VM_OBJECT_UNLOCK(m->object); } return (FALSE); } /* * This interface is for merging with malloc() someday. * Even if we never implement compaction so that contiguous allocation * works after initialization time, malloc()'s data structures are good * for statistics and for allocations of less than a page. */ static void * contigmalloc1( unsigned long size, /* should be size_t here and for malloc() */ struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary, vm_map_t map) { int i, s, start; vm_paddr_t phys; vm_object_t object; vm_offset_t addr, tmp_addr; int pass, pqtype; vm_page_t pga = vm_page_array; size = round_page(size); if (size == 0) panic("contigmalloc1: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("contigmalloc1: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("contigmalloc1: boundary must be a power of 2"); start = 0; for (pass = 0; pass <= 1; pass++) { s = splvm(); vm_page_lock_queues(); again: /* * Find first page in array that is free, within range, * aligned, and such that the boundary won't be crossed. */ for (i = start; i < cnt.v_page_count; i++) { phys = VM_PAGE_TO_PHYS(&pga[i]); pqtype = pga[i].queue - pga[i].pc; if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)) break; } /* * If the above failed or we will exceed the upper bound, fail. */ if ((i == cnt.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { again1: if (vm_contig_launder(PQ_INACTIVE)) goto again1; if (vm_contig_launder(PQ_ACTIVE)) goto again1; vm_page_unlock_queues(); splx(s); continue; } start = i; /* * Check successive pages for contiguous and free. */ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { pqtype = pga[i].queue - pga[i].pc; if ((VM_PAGE_TO_PHYS(&pga[i]) != (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) { start++; goto again; } } for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; if ((m->queue - m->pc) == PQ_CACHE) { object = m->object; if (!VM_OBJECT_TRYLOCK(object)) { start++; goto again; } vm_page_busy(m); vm_page_free(m); VM_OBJECT_UNLOCK(object); } mtx_lock_spin(&vm_page_queue_free_mtx); vm_pageq_remove_nowakeup(m); m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) vm_page_zero_count--; /* Don't clear the PG_ZERO flag, we'll need it later. */ m->flags &= PG_ZERO; KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m)); m->wire_count = 0; m->busy = 0; m->object = NULL; mtx_unlock_spin(&vm_page_queue_free_mtx); } vm_page_unlock_queues(); /* * We've found a contiguous chunk that meets are requirements. * Allocate kernel VM, unfree and assign the physical pages to * it and return kernel VM pointer. */ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr) != KERN_SUCCESS) { /* * XXX We almost never run out of kernel virtual * space, so we don't make the allocated memory * above available. */ vm_map_unlock(map); splx(s); return (NULL); } vm_object_reference(kernel_object); vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); tmp_addr = addr; VM_OBJECT_LOCK(kernel_object); for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; vm_page_insert(m, kernel_object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); if ((flags & M_ZERO) && !(m->flags & PG_ZERO)) pmap_zero_page(m); m->flags = 0; tmp_addr += PAGE_SIZE; } VM_OBJECT_UNLOCK(kernel_object); vm_map_wire(map, addr, addr + size, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); splx(s); return ((void *)addr); } return (NULL); } void * contigmalloc( unsigned long size, /* should be size_t here and for malloc() */ struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary) { void * ret; mtx_lock(&Giant); ret = contigmalloc1(size, type, flags, low, high, alignment, boundary, kernel_map); mtx_unlock(&Giant); return (ret); } void contigfree(void *addr, unsigned long size, struct malloc_type *type) { GIANT_REQUIRED; kmem_free(kernel_map, (vm_offset_t)addr, size); } vm_offset_t vm_page_alloc_contig( vm_offset_t size, vm_paddr_t low, vm_paddr_t high, vm_offset_t alignment) { vm_offset_t ret; GIANT_REQUIRED; ret = ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high, alignment, 0ul, kernel_map)); return (ret); } Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 121225) +++ head/sys/vm/vm_object.c (revision 121226) @@ -1,2114 +1,2114 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define EASY_SCAN_FACTOR 8 #define MSYNC_FLUSH_HARDSEQ 0x01 #define MSYNC_FLUSH_SOFTSEQ 0x02 /* * msync / VM object flushing optimizations */ static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ; SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags, CTLFLAG_RW, &msync_flush_flags, 0, ""); static void vm_object_qcollapse(vm_object_t object); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static long object_collapses; static long object_bypasses; static int next_index; static uma_zone_t obj_zone; #define VM_OBJECTS_INIT 256 static void vm_object_zinit(void *mem, int size); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); } #endif static void vm_object_zinit(void *mem, int size) { vm_object_t object; object = (vm_object_t)mem; bzero(&object->mtx, sizeof(object->mtx)); VM_OBJECT_LOCK_INIT(object); /* These are true for any object that has been freed */ object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; } void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { int incr; TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->root = NULL; object->type = type; object->size = size; object->generation = 1; object->ref_count = 1; object->flags = 0; if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) object->flags = OBJ_ONEMAPPING; if (size > (PQ_L2_SIZE / 3 + PQ_PRIME1)) incr = PQ_L2_SIZE / 3 + PQ_PRIME1; else incr = size; do object->pg_color = next_index; while (!atomic_cmpset_int(&next_index, object->pg_color, (object->pg_color + incr) & PQ_L2_MASK)); object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); VM_OBJECT_LOCK_INIT(&kernel_object_store); _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); /* * The kmem object's mutex is given a unique name, instead of * "vm object", to avoid false reports of lock-order reversal * with a system map mutex. */ mtx_init(VM_OBJECT_MTX(kmem_object), "kmem object", NULL, MTX_DEF); _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); uma_prealloc(obj_zone, VM_OBJECTS_INIT); } void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->flags &= ~bits; } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0); } } /* * vm_object_allocate_wait * * Return a new object with the given size, and give the user the * option of waiting for it to complete or failing if the needed * memory isn't available. */ vm_object_t vm_object_allocate_wait(objtype_t type, vm_pindex_t size, int flags) { vm_object_t result; result = (vm_object_t) uma_zalloc(obj_zone, flags); if (result != NULL) _vm_object_allocate(type, size, result); return (result); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { return(vm_object_allocate_wait(type, size, M_WAITOK)); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { struct vnode *vp; int flags; if (object == NULL) return; VM_OBJECT_LOCK(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; VI_LOCK(vp); VM_OBJECT_UNLOCK(object); for (flags = LK_INTERLOCK; vget(vp, flags, curthread); flags = 0) printf("vm_object_reference: delay in vget\n"); } else VM_OBJECT_UNLOCK(object); } /* * Handle deallocating an object of type OBJT_VNODE. */ void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; GIANT_REQUIRED; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vprint("vm_object_vndeallocate", vp); panic("vm_object_vndeallocate: bad object reference count"); } #endif object->ref_count--; if (object->ref_count == 0) { mp_fixme("Unlocked vflag access."); vp->v_vflag &= ~VV_TEXT; } VM_OBJECT_UNLOCK(object); /* * vrele may need a vop lock */ vrele(vp); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; if (object != kmem_object) mtx_lock(&Giant); while (object != NULL) { VM_OBJECT_LOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); goto done; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_UNLOCK(object); goto done; } else if (object->ref_count == 1) { if (object->shadow_count == 0) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); if (!VM_OBJECT_TRYLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_UNLOCK(object); continue; } if ((robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_UNLOCK(object); vm_object_pip_wait(robject, "objde1"); VM_OBJECT_LOCK(object); goto retry; } else if (object->paging_in_progress) { VM_OBJECT_UNLOCK(robject); object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "objde2", 0); VM_OBJECT_LOCK(robject); VM_OBJECT_LOCK(object); goto retry; } VM_OBJECT_UNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_UNLOCK(object); continue; } VM_OBJECT_UNLOCK(robject); } VM_OBJECT_UNLOCK(object); goto done; } doterm: temp = object->backing_object; if (temp != NULL) { VM_OBJECT_LOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; temp->generation++; VM_OBJECT_UNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_UNLOCK(object); object = temp; } done: if (object != kmem_object) mtx_unlock(&Giant); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p; int s; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); VM_OBJECT_LOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Now free any remaining pages. For internal objects, this also * removes them from paging queues. Don't free wired pages, just * remove them from the object. */ s = splvm(); vm_page_lock_queues(); while ((p = TAILQ_FIRST(&object->memq)) != NULL) { KASSERT(!p->busy && (p->flags & PG_BUSY) == 0, ("vm_object_terminate: freeing busy page %p " "p->busy = %d, p->flags %x\n", p, p->busy, p->flags)); if (p->wire_count == 0) { vm_page_busy(p); vm_page_free(p); cnt.v_pfree++; } else { vm_page_busy(p); vm_page_remove(p); } } vm_page_unlock_queues(); splx(s); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_UNLOCK(object); /* * Remove the object from the global object list. */ mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); wakeup(object); /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. */ void vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags) { vm_page_t p, np; vm_pindex_t tstart, tend; vm_pindex_t pi; int clearobjflags; int pagerflags; int curgeneration; GIANT_REQUIRED; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_VNODE || (object->flags & OBJ_MIGHTBEDIRTY) == 0) return; pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0; vm_object_set_flag(object, OBJ_CLEANING); tstart = start; if (end == 0) { tend = object->size; } else { tend = end; } vm_page_lock_queues(); /* * If the caller is smart and only msync()s a range he knows is * dirty, we may be able to avoid an object scan. This results in * a phenominal improvement in performance. We cannot do this * as a matter of course because the object may be huge - e.g. * the size might be in the gigabytes or terrabytes. */ if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) { vm_pindex_t tscan; int scanlimit; int scanreset; scanreset = object->resident_page_count / EASY_SCAN_FACTOR; if (scanreset < 16) scanreset = 16; pagerflags |= VM_PAGER_IGNORE_CLEANCHK; scanlimit = scanreset; tscan = tstart; while (tscan < tend) { curgeneration = object->generation; p = vm_page_lookup(object, tscan); if (p == NULL || p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) { if (--scanlimit == 0) break; ++tscan; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { if (--scanlimit == 0) break; ++tscan; continue; } /* * If we have been asked to skip nosync pages and * this is a nosync page, we can't continue. */ if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { if (--scanlimit == 0) break; ++tscan; continue; } scanlimit = scanreset; /* * This returns 0 if it was unable to busy the first * page (i.e. had to sleep). */ tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags); } /* * If everything was dirty and we flushed it successfully, * and the requested range is not the entire object, we * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can * return immediately. */ if (tscan >= tend && (tstart || tend < object->size)) { vm_page_unlock_queues(); vm_object_clear_flag(object, OBJ_CLEANING); return; } pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK; } /* * Generally set CLEANCHK interlock and make the page read-only so * we can then clear the object flags. * * However, if this is a nosync mmap then the object is likely to * stay dirty so do not mess with the page and do not clear the * object flags. */ clearobjflags = 1; TAILQ_FOREACH(p, &object->memq, listq) { vm_page_flag_set(p, PG_CLEANCHK); if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) clearobjflags = 0; else pmap_page_protect(p, VM_PROT_READ); } if (clearobjflags && (tstart == 0) && (tend == object->size)) { struct vnode *vp; vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); if (vp->v_iflag & VI_OBJDIRTY) vp->v_iflag &= ~VI_OBJDIRTY; VI_UNLOCK(vp); } } rescan: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p; p = np) { int n; np = TAILQ_NEXT(p, listq); again: pi = p->pindex; if (((p->flags & PG_CLEANCHK) == 0) || (pi < tstart) || (pi >= tend) || (p->valid == 0) || ((p->queue - p->pc) == PQ_CACHE)) { vm_page_flag_clear(p, PG_CLEANCHK); continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { vm_page_flag_clear(p, PG_CLEANCHK); continue; } /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were * not cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { vm_page_flag_clear(p, PG_CLEANCHK); continue; } n = vm_object_page_collect_flush(object, p, curgeneration, pagerflags); if (n == 0) goto rescan; if (object->generation != curgeneration) goto rescan; /* * Try to optimize the next page. If we can't we pick up * our (random) scan where we left off. */ if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) { if ((p = vm_page_lookup(object, pi + n)) != NULL) goto again; } } vm_page_unlock_queues(); #if 0 VOP_FSYNC(vp, NULL, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc); #endif vm_object_clear_flag(object, OBJ_CLEANING); return; } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags) { int runlen; int s; int maxf; int chkb; int maxb; int i; vm_pindex_t pi; vm_page_t maf[vm_pageout_page_count]; vm_page_t mab[vm_pageout_page_count]; vm_page_t ma[vm_pageout_page_count]; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pi = p->pindex; while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) { vm_page_lock_queues(); if (object->generation != curgeneration) { splx(s); return(0); } } maxf = 0; for(i = 1; i < vm_pageout_page_count; i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi + i)) != NULL) { if ((tp->flags & PG_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->flags & PG_CLEANCHK) == 0) || (tp->busy != 0)) break; if((tp->queue - tp->pc) == PQ_CACHE) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } maf[ i - 1 ] = tp; maxf++; continue; } break; } maxb = 0; chkb = vm_pageout_page_count - maxf; if (chkb) { for(i = 1; i < chkb;i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi - i)) != NULL) { if ((tp->flags & PG_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->flags & PG_CLEANCHK) == 0) || (tp->busy != 0)) break; if ((tp->queue - tp->pc) == PQ_CACHE) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } mab[ i - 1 ] = tp; maxb++; continue; } break; } } for(i = 0; i < maxb; i++) { int index = (maxb - i) - 1; ma[index] = mab[i]; vm_page_flag_clear(ma[index], PG_CLEANCHK); } vm_page_flag_clear(p, PG_CLEANCHK); ma[maxb] = p; for(i = 0; i < maxf; i++) { int index = (maxb + i) + 1; ma[index] = maf[i]; vm_page_flag_clear(ma[index], PG_CLEANCHK); } runlen = maxb + maxf + 1; splx(s); - vm_pageout_flush(ma, runlen, pagerflags, TRUE); + vm_pageout_flush(ma, runlen, pagerflags); for (i = 0; i < runlen; i++) { if (ma[i]->valid & ma[i]->dirty) { pmap_page_protect(ma[i], VM_PROT_READ); vm_page_flag_set(ma[i], PG_CLEANCHK); /* * maxf will end up being the actual number of pages * we wrote out contiguously, non-inclusive of the * first page. We do not count look-behind pages. */ if (i >= maxb + 1 && (maxf > i - maxb - 1)) maxf = i - maxb - 1; } } return(maxf + 1); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise) { vm_pindex_t end, tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; mtx_lock(&Giant); end = pindex + count; /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; VM_OBJECT_LOCK(tobject); shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } m = vm_page_lookup(tobject, tpindex); if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_LOCK(backing_object); VM_OBJECT_UNLOCK(tobject); tobject = backing_object; tpindex += OFF_TO_IDX(tobject->backing_object_offset); goto shadowlookup; } /* * If the page is busy or not in a normal active state, * we skip it. If the page is not managed there are no * page queues to mess with. Things can break if we mess * with pages in any of the below states. */ vm_page_lock_queues(); if (m->hold_count || m->wire_count || (m->flags & PG_UNMANAGED) || m->valid != VM_PAGE_BITS_ALL) { vm_page_unlock_queues(); goto unlock_tobject; } if (vm_page_sleep_if_busy(m, TRUE, "madvpo")) { VM_OBJECT_UNLOCK(tobject); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else if (advise == MADV_DONTNEED) { vm_page_dontneed(m); } else if (advise == MADV_FREE) { /* * Mark the page clean. This will allow the page * to be freed up by the system. However, such pages * are often reused quickly by malloc()/free() * so we do not do anything that would cause * a page fault if we can help it. * * Specifically, we do not try to actually free * the page now nor do we try to put it in the * cache (which would cause a page fault on reuse). * * But we do make the page is freeable as we * can without actually taking the step of unmapping * it. */ pmap_clear_modify(m); m->dirty = 0; m->act_count = 0; vm_page_dontneed(m); } vm_page_unlock_queues(); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: VM_OBJECT_UNLOCK(tobject); } mtx_unlock(&Giant); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_LOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_UNLOCK(source); return; } VM_OBJECT_UNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, length); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; if (source != NULL) { VM_OBJECT_LOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; source->generation++; if (length < source->size) length = source->size; if (length > PQ_L2_SIZE / 3 + PQ_PRIME1 || source->generation > 1) length = PQ_L2_SIZE / 3 + PQ_PRIME1; result->pg_color = (source->pg_color + length * source->generation) & PQ_L2_MASK; VM_OBJECT_UNLOCK(source); next_index = (result->pg_color + PQ_L2_SIZE / 3 + PQ_PRIME1) & PQ_L2_MASK; } /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m; vm_object_t orig_object, new_object, source; vm_offset_t s, e; vm_pindex_t offidxstart, offidxend; vm_size_t idx, size; vm_ooffset_t offset; GIANT_REQUIRED; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; offset = entry->offset; s = entry->start; e = entry->end; offidxstart = OFF_TO_IDX(offset); offidxend = offidxstart + OFF_TO_IDX(e - s); size = offidxend - offidxstart; new_object = vm_pager_allocate(orig_object->type, NULL, IDX_TO_OFF(size), VM_PROT_ALL, 0LL); if (new_object == NULL) return; source = orig_object->backing_object; if (source != NULL) { vm_object_reference(source); /* Referenced by new_object */ VM_OBJECT_LOCK(source); LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; source->generation++; vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + offset; new_object->backing_object = source; } VM_OBJECT_LOCK(orig_object); for (idx = 0; idx < size; idx++) { retry: m = vm_page_lookup(orig_object, offidxstart + idx); if (m == NULL) continue; /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, TRUE, "spltwt")) goto retry; vm_page_busy(m); vm_page_rename(m, new_object, idx); /* page automatically made dirty by rename and cache handled */ vm_page_busy(m); vm_page_unlock_queues(); } if (orig_object->type == OBJT_SWAP) { vm_object_pip_add(orig_object, 1); VM_OBJECT_UNLOCK(orig_object); /* * copy orig_object pages into new_object * and destroy unneeded pages in * shadow object. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); VM_OBJECT_LOCK(orig_object); vm_object_pip_wakeup(orig_object); } VM_OBJECT_UNLOCK(orig_object); vm_page_lock_queues(); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_wakeup(m); vm_page_unlock_queues(); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); } #define OBSC_TEST_ALL_SHADOWED 0x0001 #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static int vm_object_backing_scan(vm_object_t object, int op) { int s; int r = 1; vm_page_t p; vm_object_t backing_object; vm_pindex_t backing_offset_index; s = splvm(); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if (op & OBSC_TEST_ALL_SHADOWED) { /* * We do not want to have to test for the existence of * swap pages in the backing object. XXX but with the * new swapper this would be pretty easy to do. * * XXX what about anonymous MAP_SHARED memory that hasn't * been ZFOD faulted yet? If we do not test for this, the * shadow test may succeed! XXX */ if (backing_object->type != OBJT_DEFAULT) { splx(s); return (0); } } if (op & OBSC_COLLAPSE_WAIT) { vm_object_set_flag(backing_object, OBJ_DEAD); } /* * Our scan */ p = TAILQ_FIRST(&backing_object->memq); while (p) { vm_page_t next = TAILQ_NEXT(p, listq); vm_pindex_t new_pindex = p->pindex - backing_offset_index; if (op & OBSC_TEST_ALL_SHADOWED) { vm_page_t pp; /* * Ignore pages outside the parent object's range * and outside the parent object's mapping of the * backing object. * * note that we do not busy the backing object's * page. */ if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { p = next; continue; } /* * See if the parent has the page or if the parent's * object pager has the page. If the parent has the * page but the page is not valid, the parent's * object pager must have the page. * * If this fails, the parent does not completely shadow * the object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ( (pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL) ) { r = 0; break; } } /* * Check for busy page */ if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) { vm_page_t pp; vm_page_lock_queues(); if (op & OBSC_COLLAPSE_NOWAIT) { if ((p->flags & PG_BUSY) || !p->valid || p->hold_count || p->wire_count || p->busy) { vm_page_unlock_queues(); p = next; continue; } } else if (op & OBSC_COLLAPSE_WAIT) { if ((p->flags & PG_BUSY) || p->busy) { vm_page_flag_set(p, PG_WANTED | PG_REFERENCED); VM_OBJECT_UNLOCK(backing_object); VM_OBJECT_UNLOCK(object); msleep(p, &vm_page_queue_mtx, PDROP | PVM, "vmocol", 0); VM_OBJECT_LOCK(object); VM_OBJECT_LOCK(backing_object); /* * If we slept, anything could have * happened. Since the object is * marked dead, the backing offset * should not have changed so we * just restart our scan. */ p = TAILQ_FIRST(&backing_object->memq); continue; } } /* * Busy the page */ vm_page_busy(p); vm_page_unlock_queues(); KASSERT( p->object == backing_object, ("vm_object_qcollapse(): object mismatch") ); /* * Destroy any associated swap */ if (backing_object->type == OBJT_SWAP) { swap_pager_freespace( backing_object, p->pindex, 1 ); } if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { /* * Page is out of the parent object's range, we * can simply destroy it. */ vm_page_lock_queues(); pmap_remove_all(p); vm_page_free(p); vm_page_unlock_queues(); p = next; continue; } pp = vm_page_lookup(object, new_pindex); if ( pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL) ) { /* * page already exists in parent OR swap exists * for this location in the parent. Destroy * the original page from the backing object. * * Leave the parent's page alone */ vm_page_lock_queues(); pmap_remove_all(p); vm_page_free(p); vm_page_unlock_queues(); p = next; continue; } /* * Page does not exist in parent, rename the * page from the backing object to the main object. * * If the page was mapped to a process, it can remain * mapped through the rename. */ vm_page_lock_queues(); vm_page_rename(p, object, new_pindex); vm_page_unlock_queues(); /* page automatically made dirty by rename */ } p = next; } splx(s); return (r); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED); if (backing_object->ref_count != 1) return; backing_object->ref_count += 2; vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT); backing_object->ref_count -= 2; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (TRUE) { vm_object_t backing_object; /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_LOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(backing_object); break; } if ( object->paging_in_progress != 0 || backing_object->paging_in_progress != 0 ) { vm_object_qcollapse(object); VM_OBJECT_UNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_backing_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT); /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { vm_object_pip_add(backing_object, 1); VM_OBJECT_UNLOCK(backing_object); /* * scrap the paging_offset junk and do a * discrete copy. This also removes major * assumptions about how the swap-pager * works from where it doesn't belong. The * new swapper is able to optimize the * destroy-source case. */ vm_object_pip_add(object, 1); VM_OBJECT_UNLOCK(object); swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); VM_OBJECT_LOCK(object); vm_object_pip_wakeup(object); VM_OBJECT_LOCK(backing_object); vm_object_pip_wakeup(backing_object); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; if (backing_object->backing_object) { VM_OBJECT_LOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); backing_object->backing_object->shadow_count--; backing_object->backing_object->generation++; VM_OBJECT_UNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; if (object->backing_object) { VM_OBJECT_LOCK(object->backing_object); LIST_INSERT_HEAD( &object->backing_object->shadow_head, object, shadow_list ); object->backing_object->shadow_count++; object->backing_object->generation++; VM_OBJECT_UNLOCK(object->backing_object); } object->backing_object_offset += backing_object->backing_object_offset; /* XXX */ VM_OBJECT_UNLOCK(object); /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object)); KASSERT(TAILQ_FIRST(&backing_object->memq) == NULL, ("backing_object %p somehow has left over pages during collapse!", backing_object)); VM_OBJECT_UNLOCK(backing_object); mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE( &vm_object_list, backing_object, object_list ); mtx_unlock(&vm_object_list_mtx); uma_zfree(obj_zone, backing_object); object_collapses++; } else { vm_object_t new_backing_object; /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (vm_object_backing_scan(object, OBSC_TEST_ALL_SHADOWED) == 0) { VM_OBJECT_UNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; VM_OBJECT_UNLOCK(backing_object); /* XXX */ VM_OBJECT_UNLOCK(object); new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { vm_object_reference(new_backing_object); VM_OBJECT_LOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; new_backing_object->generation++; VM_OBJECT_UNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish; * so we don't need to call vm_object_deallocate, but * we do anyway. */ vm_object_deallocate(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ /* XXX */ VM_OBJECT_LOCK(object); } } /* * vm_object_page_remove: * * Removes all physical pages in the given range from the * object's list of pages. If the range's end is zero, all * physical pages from the range's start to the end of the object * are deleted. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, boolean_t clean_only) { vm_page_t p, next; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->resident_page_count == 0) return; /* * Since physically-backed objects do not use managed pages, we can't * remove pages from the object (we must instead remove the page * references, and then destroy the object). */ KASSERT(object->type != OBJT_PHYS, ("attempt to remove pages from a physical object")); vm_object_pip_add(object, 1); again: vm_page_lock_queues(); if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < start) { p = vm_page_splay(start, object->root); if ((object->root = p)->pindex < start) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); if (p->wire_count != 0) { pmap_remove_all(p); if (!clean_only) p->valid = 0; continue; } if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { vm_page_test_dirty(p); if (p->valid & p->dirty) continue; } vm_page_busy(p); pmap_remove_all(p); vm_page_free(p); } vm_page_unlock_queues(); vm_object_pip_wakeup(object); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * next_object Second object into coalesce * next_offset Offset into next_object * * prev_size Size of reference to prev_object * next_size Size of reference to next_object * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex, vm_size_t prev_size, vm_size_t next_size) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); mtx_lock(&Giant); VM_OBJECT_LOCK(prev_object); if (prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) { VM_OBJECT_UNLOCK(prev_object); mtx_unlock(&Giant); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_UNLOCK(prev_object); mtx_unlock(&Giant); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = prev_pindex + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_UNLOCK(prev_object); mtx_unlock(&Giant); return (FALSE); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, FALSE); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_UNLOCK(prev_object); mtx_unlock(&Giant); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { struct vnode *vp; vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) == 0) vp->v_iflag |= VI_OBJDIRTY; VI_UNLOCK(vp); } } #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ LIST_FOREACH(p, &allproc, p_list) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; if (_vm_object_in_map(kmem_map, object, 0)) return 1; if (_vm_object_in_map(pager_map, object, 0)) return 1; if (_vm_object_in_map(buffer_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; int nl = 0; int c; TAILQ_FOREACH(object, &vm_object_list, object_list) { vm_pindex_t idx, fidx; vm_pindex_t osize; vm_paddr_t pa = -1, padiff; int rcount; vm_page_t m; db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; osize = object->size; if (osize > 128) osize = 128; for (idx = 0; idx < osize; idx++) { m = vm_page_lookup(object, idx); if (m == NULL) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } continue; } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); padiff >>= PAGE_SHIFT; padiff &= PQ_L2_MASK; if (padiff == 0) { pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; ++rcount; continue; } db_printf(" index(%ld)run(%d)pa(0x%lx)", (long)fidx, rcount, (long)pa); db_printf("pd(%ld)\n", (long)padiff); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = idx; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 121225) +++ head/sys/vm/vm_pageout.c (revision 121226) @@ -1,1600 +1,1594 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static int vm_pageout_clean(vm_page_t); static void vm_pageout_page_free(vm_page_t); static void vm_pageout_pmap_collect(void); static void vm_pageout_scan(int pass); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) #endif int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; #endif static int vm_max_launder = 32; static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; static int vm_pageout_full_stats_interval = 0; static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; static int vm_swap_idle_enabled=0; #else static int vm_swap_enabled=1; static int vm_swap_idle_enabled=0; #endif SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt"); SYSCTL_INT(_vm, OID_AUTO, max_launder, CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, ""); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(void); #endif static void vm_pageout_page_stats(void); /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. */ static int vm_pageout_clean(m) vm_page_t m; { vm_object_t object; vm_page_t mc[2*vm_pageout_page_count]; int pageout_count; int ib, is, page_base; vm_pindex_t pindex = m->pindex; mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); /* * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP * with the new swapper, but we could have serious problems paging * out other object types if there is insufficient memory. * * Unfortunately, checking free memory here is far too late, so the * check has been moved up a procedural level. */ /* * Don't mess with the page if it's busy, held, or special */ if ((m->hold_count != 0) || ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) { return 0; } mc[vm_pageout_page_count] = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying * align the clusters (which leave sporatic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ object = m->object; more: while (ib && pageout_count < vm_pageout_page_count) { vm_page_t p; if (ib > pindex) { ib = 0; break; } if ((p = vm_page_lookup(object, pindex - ib)) == NULL) { ib = 0; break; } if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { ib = 0; break; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0 || p->queue != PQ_INACTIVE || p->wire_count != 0 || /* may be held by buf cache */ p->hold_count != 0) { /* may be undergoing I/O */ ib = 0; break; } mc[--page_base] = p; ++pageout_count; ++ib; /* * alignment boundry, stop here and switch directions. Do * not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { vm_page_t p; if ((p = vm_page_lookup(object, pindex + is)) == NULL) break; if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { break; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0 || p->queue != PQ_INACTIVE || p->wire_count != 0 || /* may be held by buf cache */ p->hold_count != 0) { /* may be undergoing I/O */ break; } mc[page_base + pageout_count] = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past a page boundry. This catches boundry * conditions. */ if (ib && pageout_count < vm_pageout_page_count) goto more; /* * we allow reads during pageouts... */ - return (vm_pageout_flush(&mc[page_base], pageout_count, 0, TRUE)); + return (vm_pageout_flush(&mc[page_base], pageout_count, 0)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. */ int -vm_pageout_flush(mc, count, flags, is_object_locked) - vm_page_t *mc; - int count; - int flags; - int is_object_locked; +vm_pageout_flush(vm_page_t *mc, int count, int flags) { vm_object_t object; int pageout_status[count]; int numpagedout = 0; int i; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { - KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially invalid page", mc[i], i, count)); + KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, + ("vm_pageout_flush: partially invalid page %p index %d/%d", + mc[i], i, count)); vm_page_io_start(mc[i]); pmap_page_protect(mc[i], VM_PROT_READ); } object = mc[0]->object; vm_page_unlock_queues(); - if (!is_object_locked) - VM_OBJECT_LOCK(object); vm_object_pip_add(object, count); VM_OBJECT_UNLOCK(object); vm_pager_put_pages(object, mc, count, (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)), pageout_status); VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; switch (pageout_status[i]) { case VM_PAGER_OK: case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ pmap_clear_modify(mt); vm_page_undirty(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); if (!vm_page_count_severe() || !vm_page_try_to_cache(mt)) pmap_page_protect(mt, VM_PROT_READ); } } - if (!is_object_locked) - VM_OBJECT_UNLOCK(object); return numpagedout; } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * backing_objects. * * The object and map must be locked. */ static void vm_pageout_object_deactivate_pages(pmap, first_object, desired) pmap_t pmap; vm_object_t first_object; long desired; { vm_object_t backing_object, object; vm_page_t p, next; int actcount, rcount, remove_mode; VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED); if (first_object->type == OBJT_DEVICE || first_object->type == OBJT_PHYS) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (object->paging_in_progress) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = TAILQ_FIRST(&object->memq); vm_page_lock_queues(); while (p && (rcount-- > 0)) { if (pmap_resident_count(pmap) <= desired) { vm_page_unlock_queues(); goto unlock_return; } next = TAILQ_NEXT(p, listq); cnt.v_pdpages++; if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || (p->flags & (PG_BUSY|PG_UNMANAGED)) || !pmap_page_exists_quick(pmap, p)) { p = next; continue; } actcount = pmap_ts_referenced(p); if (actcount) { vm_page_flag_set(p, PG_REFERENCED); } else if (p->flags & PG_REFERENCED) { actcount = 1; } if ((p->queue != PQ_ACTIVE) && (p->flags & PG_REFERENCED)) { vm_page_activate(p); p->act_count += actcount; vm_page_flag_clear(p, PG_REFERENCED); } else if (p->queue == PQ_ACTIVE) { if ((p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) { pmap_remove_all(p); vm_page_deactivate(p); } else { vm_pageq_requeue(p); } } else { vm_page_activate(p); vm_page_flag_clear(p, PG_REFERENCED); if (p->act_count < (ACT_MAX - ACT_ADVANCE)) p->act_count += ACT_ADVANCE; vm_pageq_requeue(p); } } else if (p->queue == PQ_INACTIVE) { pmap_remove_all(p); } p = next; } vm_page_unlock_queues(); if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_LOCK(backing_object); if (object != first_object) VM_OBJECT_UNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_UNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_UNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_UNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_UNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_UNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { GIANT_REQUIRED; vm_page_lock_queues(); pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); vm_page_unlock_queues(); } vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * Warning! The page queue lock is released and reacquired. */ static void vm_pageout_page_free(vm_page_t m) { vm_object_t object = m->object; mtx_assert(&vm_page_queue_mtx, MA_OWNED); vm_page_busy(m); vm_page_unlock_queues(); /* * Avoid a lock order reversal. The page must be busy. */ VM_OBJECT_LOCK(object); vm_page_lock_queues(); pmap_remove_all(m); vm_page_free(m); VM_OBJECT_UNLOCK(object); cnt.v_dfree++; } /* * This routine is very drastic, but can save the system * in a pinch. */ static void vm_pageout_pmap_collect(void) { int i; vm_page_t m; static int warningdone; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } vm_page_lock_queues(); for (i = 0; i < vm_page_array_size; i++) { m = &vm_page_array[i]; if (m->wire_count || m->hold_count || m->busy || (m->flags & (PG_BUSY | PG_UNMANAGED))) continue; pmap_remove_all(m); } vm_page_unlock_queues(); pmap_pagedaemon_waken = 0; } /* * vm_pageout_scan does the dirty work for the pageout daemon. */ static void vm_pageout_scan(int pass) { vm_page_t m, next; struct vm_page marker; int page_shortage, maxscan, pcount; int addl_page_shortage, addl_page_shortage_init; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; int actcount; int vnodes_skipped = 0; int maxlaunder; int s; struct thread *td; GIANT_REQUIRED; /* * Decrease registered cache sizes. */ EVENTHANDLER_INVOKE(vm_lowmem, 0); /* * We do this explicitly after the caches have been drained above. */ uma_reclaim(); /* * Do whatever cleanup that the pmap code can. */ vm_pageout_pmap_collect(); addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit); /* * Calculate the number of pages we want to either free or move * to the cache. */ page_shortage = vm_paging_target() + addl_page_shortage_init; /* * Initialize our marker */ bzero(&marker, sizeof(marker)); marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; marker.queue = PQ_INACTIVE; marker.wire_count = 1; /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or * we have scanned the entire inactive queue. Note that m->act_count * is not used to form decisions for the inactive queue, only for the * active queue. * * maxlaunder limits the number of dirty pages we flush per scan. * For most systems a smaller value (16 or 32) is more robust under * extreme memory and disk pressure because any unnecessary writes * to disk can result in extreme performance degredation. However, * systems with excessive dirty pages (especially when MAP_NOSYNC is * used) will die horribly with limited laundering. If the pageout * daemon cannot clean enough pages in the first pass, we let it go * all out in succeeding passes. */ if ((maxlaunder = vm_max_launder) <= 1) maxlaunder = 1; if (pass) maxlaunder = 10000; vm_page_lock_queues(); rescan0: addl_page_shortage = addl_page_shortage_init; maxscan = cnt.v_inactive_count; for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { cnt.v_pdpages++; if (m->queue != PQ_INACTIVE) { goto rescan0; } next = TAILQ_NEXT(m, pageq); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; /* * A held page may be undergoing I/O, so skip it. */ if (m->hold_count) { vm_pageq_requeue(m); addl_page_shortage++; continue; } /* * Don't mess with busy pages, keep in the front of the * queue, most likely are being paged out. */ if (m->busy || (m->flags & PG_BUSY)) { addl_page_shortage++; continue; } /* * If the object is not being used, we ignore previous * references. */ if (m->object->ref_count == 0) { vm_page_flag_clear(m, PG_REFERENCED); pmap_clear_reference(m); /* * Otherwise, if the page has been referenced while in the * inactive queue, we bump the "activation count" upwards, * making it less likely that the page will be added back to * the inactive queue prematurely again. Here we check the * page tables (or emulated bits, if any), given the upper * level VM system not knowing anything about existing * references. */ } else if (((m->flags & PG_REFERENCED) == 0) && (actcount = pmap_ts_referenced(m))) { vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE); continue; } /* * If the upper level VM system knows about any page * references, we activate the page. We also set the * "activation count" higher than normal so that we will less * likely place pages back onto the inactive queue again. */ if ((m->flags & PG_REFERENCED) != 0) { vm_page_flag_clear(m, PG_REFERENCED); actcount = pmap_ts_referenced(m); vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE + 1); continue; } /* * If the upper level VM system doesn't know anything about * the page being dirty, we have to check for it again. As * far as the VM code knows, any partially dirty pages are * fully dirty. */ if (m->dirty == 0) { vm_page_test_dirty(m); } else { vm_page_dirty(m); } object = m->object; if (!VM_OBJECT_TRYLOCK(object)) continue; if (m->valid == 0) { /* * Invalid pages can be easily freed */ vm_page_busy(m); pmap_remove_all(m); vm_page_free(m); cnt.v_dfree++; --page_shortage; } else if (m->dirty == 0) { /* * Clean pages can be placed onto the cache queue. * This effectively frees them. */ vm_page_cache(m); --page_shortage; } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { /* * Dirty pages need to be paged out, but flushing * a page is extremely expensive verses freeing * a clean page. Rather then artificially limiting * the number of pages we can flush, we instead give * dirty pages extra priority on the inactive queue * by forcing them to be cycled through the queue * twice before being flushed, after which the * (now clean) page will cycle through once more * before being freed. This significantly extends * the thrash point for a heavily loaded machine. */ vm_page_flag_set(m, PG_WINATCFLS); vm_pageq_requeue(m); } else if (maxlaunder > 0) { /* * We always want to try to flush some dirty pages if * we encounter them, to keep the system stable. * Normally this number is small, but under extreme * pressure where there are insufficient clean pages * on the inactive queue, we may have to go all out. */ int swap_pageouts_ok; struct vnode *vp = NULL; struct mount *mp; if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { swap_pageouts_ok = 1; } else { swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && vm_page_count_min()); } /* * We don't bother paging objects that are "dead". * Those objects are in a "rundown" state. */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(object); vm_pageq_requeue(m); continue; } /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * The previous code skipped locked vnodes and, worse, * reordered pages in the queue. This results in * completely non-deterministic operation and, on a * busy system, can lead to extremely non-optimal * pageouts. For example, it can cause clean pages * to be freed and dirty pages to be moved to the end * of the queue. Since dirty pages are also moved to * the end of the queue once-cleaned, this gives * way too large a weighting to defering the freeing * of dirty pages. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vp = object->handle; mp = NULL; if (vp->v_type == VREG) vn_start_write(vp, &mp, V_NOWAIT); vm_page_unlock_queues(); VI_LOCK(vp); VM_OBJECT_UNLOCK(object); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_TIMELOCK, curthread)) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); ++pageout_lock_miss; vn_finished_write(mp); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; VM_OBJECT_UNLOCK(object); continue; } VM_OBJECT_LOCK(object); vm_page_lock_queues(); /* * The page might have been moved to another * queue during potential blocking in vget() * above. The page might have been freed and * reused for another vnode. The object might * have been reused for another vnode. */ if (m->queue != PQ_INACTIVE || m->object != object || object->handle != vp) { if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; } /* * The page may have been busied during the * blocking in vput(); We don't move the * page back onto the end of the queue so that * statistics are more correct if we don't. */ if (m->busy || (m->flags & PG_BUSY)) { goto unlock_and_continue; } /* * If the page has become held it might * be undergoing I/O, so skip it */ if (m->hold_count) { vm_pageq_requeue(m); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. * * This operation may cluster, invalidating the 'next' * pointer. To prevent an inordinate number of * restarts we use our marker to remember our place. * * decrement page_shortage on success to account for * the (future) cleaned page. Otherwise we could wind * up laundering or cleaning too many pages. */ s = splvm(); TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq); splx(s); if (vm_pageout_clean(m) != 0) { --page_shortage; --maxlaunder; } s = splvm(); next = TAILQ_NEXT(&marker, pageq); TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq); splx(s); unlock_and_continue: VM_OBJECT_UNLOCK(object); if (vp) { vm_page_unlock_queues(); vput(vp); vn_finished_write(mp); vm_page_lock_queues(); } continue; } VM_OBJECT_UNLOCK(object); } /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ page_shortage = vm_paging_target() + cnt.v_inactive_target - cnt.v_inactive_count; page_shortage += addl_page_shortage; /* * Scan the active queue for things we can deactivate. We nominally * track the per-page activity counter and use it to locate * deactivation candidates. */ pcount = cnt.v_active_count; m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl); while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { /* * This is a consistency check, and should likely be a panic * or warning. */ if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { vm_pageq_requeue(m); m = next; continue; } /* * The count for pagedaemon pages is done after checking the * page for eligibility... */ cnt.v_pdpages++; /* * Check to see "how much" the page has been used. */ actcount = 0; if (m->object->ref_count != 0) { if (m->flags & PG_REFERENCED) { actcount += 1; } actcount += pmap_ts_referenced(m); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } } /* * Since we have "tested" this bit, we need to clear it now. */ vm_page_flag_clear(m, PG_REFERENCED); /* * Only if an object is currently being used, do we use the * page activation count stats. */ if (actcount && (m->object->ref_count != 0)) { vm_pageq_requeue(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm || m->object->ref_count == 0 || m->act_count == 0) { page_shortage--; if (m->object->ref_count == 0) { pmap_remove_all(m); if (m->dirty == 0) vm_page_cache(m); else vm_page_deactivate(m); } else { vm_page_deactivate(m); } } else { vm_pageq_requeue(m); } } m = next; } s = splvm(); /* * We try to maintain some *really* free pages, this allows interrupt * code to be guaranteed space. Since both cache and free queues * are considered basically 'free', moving pages from cache to free * does not effect other calculations. */ while (cnt.v_free_count < cnt.v_free_reserved) { static int cache_rover = 0; m = vm_pageq_find(PQ_CACHE, cache_rover, FALSE); if (!m) break; if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->hold_count || m->wire_count) { #ifdef INVARIANTS printf("Warning: busy page %p found in cache\n", m); #endif vm_page_deactivate(m); continue; } cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK; vm_pageout_page_free(m); } splx(s); vm_page_unlock_queues(); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second. */ if (vm_swap_idle_enabled) { static long lsec; if (time_second != lsec) { vm_pageout_req_swapout |= VM_SWAP_IDLE; vm_req_vmdaemon(); lsec = time_second; } } #endif /* * If we didn't get enough free pages, and we have skipped a vnode * in a writeable object, wakeup the sync daemon. And kick swapout * if we did not get enough free pages. */ if (vm_paging_target() > 0) { if (vnodes_skipped && vm_page_count_min()) (void) speedup_syncer(); #if !defined(NO_SWAPPING) if (vm_swap_enabled && vm_page_count_target()) { vm_req_vmdaemon(); vm_pageout_req_swapout |= VM_SWAP_NORMAL; } #endif } /* * If we are critically low on one of RAM or swap and low on * the other, kill the largest process. However, we avoid * doing this on the first pass in order to give ourselves a * chance to flush out dirty vnode-backed pages and to allow * active pages to be moved to the inactive queue and reclaimed. * * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of it's child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ if (pass != 0 && ((swap_pager_avail < 64 && vm_page_count_min()) || (swap_pager_full && vm_paging_target() > 0))) { bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { int breakout; /* * If this process is already locked, skip it. */ if (PROC_TRYLOCK(p) == 0) continue; /* * If this is a system or protected process, skip it. */ if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) || (p->p_flag & P_PROTECTED) || ((p->p_pid < 48) && (swap_pager_avail != 0))) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. Check all the threads individually. */ mtx_lock_spin(&sched_lock); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { breakout = 1; break; } } if (breakout) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); continue; } mtx_unlock_spin(&sched_lock); /* * get the process size */ if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) { PROC_UNLOCK(p); continue; } size = vmspace_swap_count(p->p_vmspace); vm_map_unlock_read(&p->p_vmspace->vm_map); size += vmspace_resident_count(p->p_vmspace); /* * if the this process is bigger than the biggest one * remember it. */ if (size > bigsize) { if (bigproc != NULL) PROC_UNLOCK(bigproc); bigproc = p; bigsize = size; } else PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (bigproc != NULL) { struct ksegrp *kg; killproc(bigproc, "out of swap space"); mtx_lock_spin(&sched_lock); FOREACH_KSEGRP_IN_PROC(bigproc, kg) { sched_nice(kg, PRIO_MIN); /* XXXKSE ??? */ } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(bigproc); wakeup(&cnt.v_free_count); } } } /* * This routine tries to maintain the pseudo LRU active queue, * so that during long periods of time where there is no paging, * that some statistic accumulation still occurs. This code * helps the situation where paging just starts to occur. */ static void vm_pageout_page_stats() { vm_page_t m,next; int pcount,tpcount; /* Number of pages to check */ static int fullintervalcount = 0; int page_shortage; int s0; page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); if (page_shortage <= 0) return; s0 = splvm(); vm_page_lock_queues(); pcount = cnt.v_active_count; fullintervalcount += vm_pageout_stats_interval; if (fullintervalcount < vm_pageout_full_stats_interval) { tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count; if (pcount > tpcount) pcount = tpcount; } else { fullintervalcount = 0; } m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl); while ((m != NULL) && (pcount-- > 0)) { int actcount; if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { vm_pageq_requeue(m); m = next; continue; } actcount = 0; if (m->flags & PG_REFERENCED) { vm_page_flag_clear(m, PG_REFERENCED); actcount += 1; } actcount += pmap_ts_referenced(m); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; vm_pageq_requeue(m); } else { if (m->act_count == 0) { /* * We turn off page access, so that we have * more accurate RSS stats. We don't do this * in the normal page deactivation when the * system is loaded VM wise, because the * cost of the large number of page protect * operations would be higher than the value * of doing the operation. */ pmap_remove_all(m); vm_page_deactivate(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); vm_pageq_requeue(m); } } m = next; } vm_page_unlock_queues(); splx(s0); } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { int error, pass, s; mtx_lock(&Giant); /* * Initialize some paging parameters. */ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + cnt.v_interrupt_free_min; cnt.v_free_reserved = vm_pageout_page_count + cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_L2_SIZE; cnt.v_free_severe = cnt.v_free_min / 2; cnt.v_free_min += cnt.v_free_reserved; cnt.v_free_severe += cnt.v_free_reserved; /* * v_free_target and v_cache_min control pageout hysteresis. Note * that these are more a measure of the VM cache queue hysteresis * then the VM free queue. Specifically, v_free_target is the * high water mark (free+cache pages). * * v_free_reserved + v_cache_min (mostly means v_cache_min) is the * low water mark, while v_free_min is the stop. v_cache_min must * be big enough to handle memory needs while the pageout daemon * is signalled and run to free more pages. */ if (cnt.v_free_count > 6144) cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; else cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved; if (cnt.v_free_count > 2048) { cnt.v_cache_min = cnt.v_free_target; cnt.v_cache_max = 2 * cnt.v_cache_min; cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; } else { cnt.v_cache_min = 0; cnt.v_cache_max = 0; cnt.v_inactive_target = cnt.v_free_count / 4; } if (cnt.v_inactive_target > cnt.v_free_count / 3) cnt.v_inactive_target = cnt.v_free_count / 3; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; if (vm_pageout_stats_max == 0) vm_pageout_stats_max = cnt.v_free_target; /* * Set interval in seconds for stats scan. */ if (vm_pageout_stats_interval == 0) vm_pageout_stats_interval = 5; if (vm_pageout_full_stats_interval == 0) vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; /* * Set maximum free per pass */ if (vm_pageout_stats_free_max == 0) vm_pageout_stats_free_max = 5; swap_pager_swap_init(); pass = 0; /* * The pageout daemon is never done, so loop forever. */ while (TRUE) { s = splvm(); vm_page_lock_queues(); /* * If we have enough free memory, wakeup waiters. Do * not clear vm_pages_needed until we reach our target, * otherwise we may be woken up over and over again and * waste a lot of cpu. */ if (vm_pages_needed && !vm_page_count_min()) { if (!vm_paging_needed()) vm_pages_needed = 0; wakeup(&cnt.v_free_count); } if (vm_pages_needed) { /* * Still not done, take a second pass without waiting * (unlimited dirty cleaning), otherwise sleep a bit * and try again. */ ++pass; if (pass > 1) msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM, "psleep", hz/2); } else { /* * Good enough, sleep & handle stats. Prime the pass * for the next run. */ if (pass > 1) pass = 1; else pass = 0; error = msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { vm_page_unlock_queues(); splx(s); pass = 0; vm_pageout_page_stats(); continue; } } if (vm_pages_needed) cnt.v_pdwakeups++; vm_page_unlock_queues(); splx(s); vm_pageout_scan(pass); } } /* * Unless the page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &cnt.v_free_count following this function unless * the page queue lock is held until the msleep() is performed. */ void pagedaemon_wakeup() { if (!vm_pages_needed && curthread->td_proc != pageproc) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } static void vm_daemon() { struct proc *p; int breakout; struct thread *td; mtx_lock(&Giant); while (TRUE) { tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(vm_pageout_req_swapout); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_flag & (P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ mtx_lock_spin(&sched_lock); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { breakout = 1; break; } } mtx_unlock_spin(&sched_lock); if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ limit = OFF_TO_IDX( qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, p->p_rlimit[RLIMIT_RSS].rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_sflag & PS_INMEM) == 0) limit = 0; /* XXX */ PROC_UNLOCK(p); size = vmspace_resident_count(p->p_vmspace); if (limit >= 0 && size >= limit) { vm_pageout_map_deactivate_pages( &p->p_vmspace->vm_map, limit); } } sx_sunlock(&allproc_lock); } } #endif /* !defined(NO_SWAPPING) */ Index: head/sys/vm/vm_pageout.h =================================================================== --- head/sys/vm/vm_pageout.h (revision 121225) +++ head/sys/vm/vm_pageout.h (revision 121226) @@ -1,113 +1,113 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_VM_PAGEOUT_H_ #define _VM_VM_PAGEOUT_H_ /* * Header file for pageout daemon. */ /* * Exported data structures. */ extern int vm_page_max_wired; extern int vm_pages_needed; /* should be some "event" structure */ extern int vm_pageout_pages_needed; extern int vm_pageout_deficit; extern int vm_pageout_page_count; /* * Swap out requests */ #define VM_SWAP_NORMAL 1 #define VM_SWAP_IDLE 2 /* * Exported routines. */ /* * Signal pageout-daemon and wait for it. */ extern void pagedaemon_wakeup(void); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() extern void vm_wait(void); extern void vm_waitpfault(void); /* XXX This is probably misplaced. */ #ifndef NO_SWAPPING struct swdevt; void vm_proc_swapin_all(struct swdevt *); #endif /* !NO_SWAPPING */ #ifdef _KERNEL -int vm_pageout_flush(vm_page_t *, int, int, int is_object_locked); +int vm_pageout_flush(vm_page_t *, int, int); #endif #endif /* _VM_VM_PAGEOUT_H_ */