Index: user/jeff/numa/sys/vm/vm_pageout.c =================================================================== --- user/jeff/numa/sys/vm/vm_pageout.c (revision 329603) +++ user/jeff/numa/sys/vm/vm_pageout.c (revision 329604) @@ -1,2157 +1,2156 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static bool vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); /* Pagedaemon activity rates, in subdivisions of one second. */ #define VM_LAUNDER_RATE 10 #define VM_INACT_SCAN_RATE 2 static int vm_pageout_oom_seq = 12; static int vm_pageout_update_period; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; static int swapdev_enabled; static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "panic on out of memory instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RWTUN, &vm_pageout_update_period, 0, "Maximum active LRU update period"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0, "Low memory callback period"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); static int act_scan_laundry_weight = 3; SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN, &act_scan_laundry_weight, 0, "weight given to clean vs. dirty pages in active queue scans"); static u_int vm_background_launder_rate = 4096; SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, &vm_background_launder_rate, 0, "background laundering rate, in kilobytes per second"); static u_int vm_background_launder_max = 20 * 1024; SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN, &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); int vm_pageout_page_count = 32; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static u_int isqrt(u_int num); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall); static void vm_pageout_laundry_worker(void *arg); static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); /* * Initialize a dummy page for marking the caller's place in the specified * paging queue. In principle, this function only needs to set the flag * PG_MARKER. Nonetheless, it write busies and initializes the hold count * to one as safety precautions. */ static void vm_pageout_init_marker(vm_page_t marker, u_short queue) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; marker->queue = queue; marker->hold_count = 1; } /* * vm_pageout_fallback_object_lock: * * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is * known to have failed and page queue must be either PQ_ACTIVE or * PQ_INACTIVE. To avoid lock order violation, unlock the page queue * while locking the vm object. Use marker page to detect page queue * changes and maintain notion of next page on page queue. Return * TRUE if no changes were detected, FALSE otherwise. vm object is * locked on return. * * This function depends on both the lock portion of struct vm_object * and normal struct vm_page being type stable. */ static boolean_t vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_object_t object; queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_unlock(m); VM_OBJECT_WLOCK(object); vm_page_lock(m); vm_pagequeue_lock(pq); /* * The page's object might have changed, and/or the page might * have moved from its original position in the queue. If the * page's object has changed, then the caller should abandon * processing the page because the wrong object lock was * acquired. Use the marker's plinks.q, not the page's, to * determine if the page has been moved. The state of the * page's plinks.q can be indeterminate; whereas, the marker's * plinks.q must be valid. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m->object == object && m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Lock the page while holding the page queue lock. Use marker page * to detect page queue changes and maintain notion of next page on * page queue. Return TRUE if no changes were detected, FALSE * otherwise. The page is locked on return. The page queue lock might * be dropped and reacquired. * * This function depends on normal struct vm_page being type stable. */ static boolean_t vm_pageout_page_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_page_lock_assert(m, MA_NOTOWNED); if (vm_page_trylock(m)) return (TRUE); queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_lock(m); vm_pagequeue_lock(pq); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Scan for pages at adjacent offsets within the given page's object that are * eligible for laundering, form a cluster of these pages and the given page, * and launder that cluster. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; vm_pindex_t pindex; int ib, is, page_base, pageout_count; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); pindex = m->pindex; vm_page_assert_unbusied(m); KASSERT(!vm_page_held(m), ("page %p is held", m)); pmap_remove_write(m); vm_page_unlock(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * We can cluster only if the page is not clean, busy, or held, and * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying to * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { ib = 0; break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; break; } vm_page_lock(p); if (!vm_page_in_laundry(p) || vm_page_held(p)) { vm_page_unlock(p); ib = 0; break; } pmap_remove_write(p); vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * We are at an alignment boundary. Stop here, and switch * directions. Do not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_test_dirty(p); if (p->dirty == 0) break; vm_page_lock(p); if (!vm_page_in_laundry(p) || vm_page_held(p)) { vm_page_unlock(p); break; } pmap_remove_write(p); vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past an alignment boundary. This catches * boundary conditions. */ if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; return (vm_pageout_flush(&mc[page_base], pageout_count, VM_PAGER_PUT_NOREUSE, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Mark the pages busy and verify that they're valid * and read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0, ("vm_pageout_flush: writeable page %p", mc[i])); vm_page_sbusy(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: vm_page_lock(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); vm_page_unlock(mt); /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * The page is outside the object's range. We pretend * that the page out worked and clean the page, so the * changes will be lost if the page is reclaimed by * the page daemon. */ vm_page_undirty(mt); vm_page_lock(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); vm_page_unlock(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If the page couldn't be paged out to swap because the * pager wasn't able to find space, place the page in * the PQ_UNSWAPPABLE holding queue. This is an * optimization that prevents the page daemon from * wasting CPU cycles on pages that cannot be reclaimed * becase no swap device is configured. * * Otherwise, reactivate the page so that it doesn't * clog the laundry and inactive queues. (We will try * paging it out again later.) */ vm_page_lock(mt); if (object->type == OBJT_SWAP && pageout_status[i] == VM_PAGER_FAIL) { vm_page_unswappable(mt); numpagedout++; } else vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } static void vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused) { atomic_store_rel_int(&swapdev_enabled, 1); } static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused) { if (swap_pager_nswapdev() == 1) atomic_store_rel_int(&swapdev_enabled, 0); } /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); /* * Ensure that the object and vnode were not disassociated * while locks were dropped. */ if (vp->v_object != object) { error = ENOENT; goto unlock_all; } vm_page_lock(m); /* * While the object and page were unlocked, the page * may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; goto unlock_all; } /* * The page may have been busied or referenced while the object * and page locks were released. */ if (vm_page_busied(m) || vm_page_held(m)) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: vm_page_lock_assert(m, MA_NOTOWNED); if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * Attempt to launder the specified number of pages. * * Returns the number of pages successfully laundered. */ static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { struct vm_pagequeue *pq; vm_object_t object; vm_page_t m, next; int act_delta, error, maxscan, numpagedout, starting_target; int vnodes_skipped; bool pageout_ok, queue_locked; starting_target = launder; vnodes_skipped = 0; /* * Scan the laundry queues for pages eligible to be laundered. We stop * once the target number of dirty pages have been laundered, or once * we've reached the end of the queue. A single iteration of this loop * may cause more than one page to be laundered because of clustering. * * maxscan ensures that we don't re-examine requeued pages. Any * additional pages written as part of a cluster are subtracted from * maxscan since they must be taken from the laundry queue. * * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no * swap devices are configured. */ if (atomic_load_acq_int(&swapdev_enabled)) pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]; else pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; scan: vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; queue_locked = true; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && launder > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked laundry queue")); KASSERT(vm_page_in_laundry(m), ("page %p has an inconsistent queue", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { vm_page_unlock(m); continue; } if (m->wire_count != 0) { vm_page_dequeue_locked(m); vm_page_unlock(m); continue; } object = m->object; if ((!VM_OBJECT_TRYWLOCK(object) && (!vm_pageout_fallback_object_lock(m, &next) || vm_page_held(m))) || vm_page_busied(m)) { VM_OBJECT_WUNLOCK(object); if (m->wire_count != 0 && vm_page_pagequeue(m) == pq) vm_page_dequeue_locked(m); vm_page_unlock(m); continue; } /* * Unlock the laundry queue, invalidating the 'next' pointer. * Use a marker to remember our place in the laundry queue. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, plinks.q); vm_pagequeue_unlock(pq); queue_locked = false; /* * Invalid pages can be easily freed. They cannot be * mapped; vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) act_delta += pmap_ts_referenced(m); else { KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { VM_CNT_INC(v_reactivated); vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the laundry queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. */ m->act_count += act_delta + ACT_ADVANCE; /* * If this was a background laundering, count * activated pages towards our target. The * purpose of background laundering is to ensure * that pages are eventually cycled through the * laundry queue, and an activation is a valid * way out. */ if (!in_shortfall) launder--; goto drop_page; } else if ((object->flags & OBJ_DEAD) == 0) goto requeue_page; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } /* * Clean pages are freed, and dirty pages are paged out unless * they belong to a dead object. Requeueing dirty pages from * dead objects is pointless, as they are being paged out and * freed by the thread that destroyed the object. */ if (m->dirty == 0) { free_page: vm_page_free(m); VM_CNT_INC(v_dfree); } else if ((object->flags & OBJ_DEAD) == 0) { if (object->type != OBJT_SWAP && object->type != OBJT_DEFAULT) pageout_ok = true; else if (disable_swap_pageouts) pageout_ok = false; else pageout_ok = true; if (!pageout_ok) { requeue_page: vm_pagequeue_lock(pq); queue_locked = true; vm_page_requeue_locked(m); goto drop_page; } /* * Form a cluster with adjacent, dirty pages from the * same object, and page out that entire cluster. * * The adjacent, dirty pages must also be in the * laundry. However, their mappings are not checked * for new references. Consequently, a recently * referenced page may be paged out. However, that * page will not be prematurely reclaimed. After page * out, the page will be placed in the inactive queue, * where any new references will be detected and the * page reactivated. */ error = vm_pageout_clean(m, &numpagedout); if (error == 0) { launder -= numpagedout; maxscan -= numpagedout - 1; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } goto relock_queue; } drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); relock_queue: if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = true; } next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); } vm_pagequeue_unlock(pq); if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) { pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; goto scan; } /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't launder enough pages. */ if (vnodes_skipped > 0 && launder > 0) (void)speedup_syncer(); return (starting_target - launder); } /* * Compute the integer square root. */ static u_int isqrt(u_int num) { u_int bit, root, tmp; bit = 1u << ((NBBY * sizeof(u_int)) - 2); while (bit > num) bit >>= 2; root = 0; while (bit != 0) { tmp = root + bit; root >>= 1; if (num >= tmp) { num -= tmp; root += bit; } bit >>= 2; } return (root); } /* * Perform the work of the laundry thread: periodically wake up and determine * whether any pages need to be laundered. If so, determine the number of pages * that need to be laundered, and launder them. */ static void vm_pageout_laundry_worker(void *arg) { struct vm_domain *vmd; struct vm_pagequeue *pq; uint64_t nclean, ndirty; u_int inactq_scans, last_launder; int domain, last_target, launder, shortfall, shortfall_cycle, target; bool in_shortfall; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; KASSERT(vmd->vmd_segs != 0, ("domain without segments")); vm_pageout_init_marker(&vmd->vmd_laundry_marker, PQ_LAUNDRY); shortfall = 0; in_shortfall = false; shortfall_cycle = 0; target = 0; inactq_scans = 0; last_launder = 0; /* * Calls to these handlers are serialized by the swap syscall lock. */ (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd, EVENTHANDLER_PRI_ANY); (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd, EVENTHANDLER_PRI_ANY); /* * The pageout laundry worker is never done, so loop forever. */ for (;;) { KASSERT(target >= 0, ("negative target %d", target)); KASSERT(shortfall_cycle >= 0, ("negative cycle %d", shortfall_cycle)); launder = 0; /* * First determine whether we need to launder pages to meet a * shortage of free pages. */ if (shortfall > 0) { in_shortfall = true; shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; target = shortfall; } else if (!in_shortfall) goto trybackground; else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) { /* * We recently entered shortfall and began laundering * pages. If we have completed that laundering run * (and we are no longer in shortfall) or we have met * our laundry target through other activity, then we * can stop laundering pages. */ in_shortfall = false; target = 0; goto trybackground; } last_launder = inactq_scans; launder = target / shortfall_cycle--; goto dolaundry; /* * There's no immediate need to launder any pages; see if we * meet the conditions to perform background laundering: * * 1. The ratio of dirty to clean inactive pages exceeds the * background laundering threshold and the pagedaemon has * been woken up to reclaim pages since our last * laundering, or * 2. we haven't yet reached the target of the current * background laundering run. * * The background laundering threshold is not a constant. * Instead, it is a slowly growing function of the number of * page daemon scans since the last laundering. Thus, as the * ratio of dirty to clean inactive pages grows, the amount of * memory pressure required to trigger laundering decreases. */ trybackground: nclean = vmd->vmd_free_count + vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt; ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt; if (target == 0 && inactq_scans != last_launder && ndirty * isqrt(inactq_scans - last_launder) >= nclean) { target = vmd->vmd_background_launder_target; } /* * We have a non-zero background laundering target. If we've * laundered up to our maximum without observing a page daemon * request, just stop. This is a safety belt that ensures we * don't launder an excessive amount if memory pressure is low * and the ratio of dirty to clean pages is large. Otherwise, * proceed at the background laundering rate. */ if (target > 0) { if (inactq_scans != last_launder) { last_launder = inactq_scans; last_target = target; } else if (last_target - target >= vm_background_launder_max * PAGE_SIZE / 1024) { target = 0; } launder = vm_background_launder_rate * PAGE_SIZE / 1024; launder /= VM_LAUNDER_RATE; if (launder > target) launder = target; } dolaundry: if (launder > 0) { /* * Because of I/O clustering, the number of laundered * pages could exceed "target" by the maximum size of * a cluster minus one. */ target -= min(vm_pageout_launder(vmd, launder, in_shortfall), target); pause("laundp", hz / VM_LAUNDER_RATE); } /* * If we're not currently laundering pages and the page daemon * hasn't posted a new request, sleep until the page daemon * kicks us. */ vm_pagequeue_lock(pq); if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE) (void)mtx_sleep(&vmd->vmd_laundry_request, vm_pagequeue_lockptr(pq), PVM, "launds", 0); /* * If the pagedaemon has indicated that it's in shortfall, start * a shortfall laundering unless we're already in the middle of * one. This may preempt a background laundering. */ if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL && (!in_shortfall || shortfall_cycle == 0)) { shortfall = vm_laundry_target(vmd) + vmd->vmd_pageout_deficit; target = 0; } else shortfall = 0; if (target == 0) vmd->vmd_laundry_request = VM_LAUNDRY_IDLE; inactq_scans = vmd->vmd_inactq_scans; vm_pagequeue_unlock(pq); } } struct pgo_pglist { struct pglist pgl; int count; }; static void vm_pageout_pglist_init(struct pgo_pglist *pglist) { TAILQ_INIT(&pglist->pgl); pglist->count = 0; } static bool vm_pageout_pglist_append(struct pgo_pglist *pglist, vm_page_t m) { if (vm_page_free_prep(m, false)) { m->flags &= ~PG_ZERO; TAILQ_INSERT_TAIL(&pglist->pgl, m, listq); pglist->count++; return (true); } return (false); } static void vm_pageout_pglist_flush(struct pgo_pglist *pglist, bool force) { if (pglist->count > 64 || (force && pglist->count != 0)) { vm_page_free_phys_pglist(&pglist->pgl); vm_pageout_pglist_init(pglist); } } static int vm_pageout_free_pages(struct pgo_pglist *pglist, vm_object_t object, vm_page_t m) { vm_page_t p, pp; struct mtx *mtx; vm_pindex_t start; int pcount, count; pcount = MAX(object->iosize / PAGE_SIZE, 1); - count = 0; + count = 1; if (pcount == 1 || vm_object_reserv(object)) { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); goto out; } /* Find the first page in the block. */ start = m->pindex - (m->pindex % pcount); for (p = m; p->pindex > start && (pp = vm_page_prev(p)) != NULL; p = pp); /* Free the original page so we don't validate it twice. */ mtx = vm_page_lockptr(m); if (p == m) p = vm_page_next(m); - if (vm_pageout_pglist_append(pglist, m)) - count++; + vm_pageout_pglist_append(pglist, m); /* Iterate through the block range and free compatible pages. */ for (m = p; m != NULL && m->pindex < start + pcount; m = p) { p = TAILQ_NEXT(m, listq); if (mtx != vm_page_lockptr(m)) { mtx_unlock(mtx); mtx = vm_page_lockptr(m); mtx_lock(mtx); } - if (m->hold_count || vm_page_busied(m) || + if (vm_page_held(m) || vm_page_busied(m) || m->queue != PQ_INACTIVE) continue; if (m->valid == 0) goto free_page; if ((m->aflags & PGA_REFERENCED) != 0) continue; if (object->ref_count != 0) { if (pmap_ts_referenced(m)) { vm_page_aflag_set(m, PGA_REFERENCED); continue; } vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } if (m->dirty) continue; free_page: if (vm_pageout_pglist_append(pglist, m)) count++; } mtx_unlock(mtx); VM_OBJECT_WUNLOCK(object); vm_pageout_pglist_flush(pglist, false); out: VM_CNT_ADD(v_dfree, count); return (count); } /* * vm_pageout_scan does the dirty work for the pageout daemon. * * pass == 0: Update active LRU/deactivate pages * pass >= 1: Free inactive pages * * Returns true if pass was zero or enough pages were freed by the inactive * queue scan to meet the target. */ static bool vm_pageout_scan(struct vm_domain *vmd, int pass) { struct pgo_pglist pglist; vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; long min_scan; int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; int page_shortage, scan_tick, scanned, starting_page_shortage; boolean_t queue_locked; /* * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ if (vmd == VM_DOMAIN(0) && pass > 0 && (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(); lowmem_uptime = time_uptime; } /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Calculate the number of pages that we want to free. This number * can be negative if many pages are freed between the wakeup call to * the page daemon and this calculation. */ if (pass > 0) { deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit); page_shortage = vm_paging_target(vmd) + deficit; } else page_shortage = deficit = 0; starting_page_shortage = page_shortage; /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make * decisions for the inactive queue, only for the active queue.) */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pageout_pglist_init(&pglist); vm_pagequeue_lock(pq); queue_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked inactive queue")); KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); VM_CNT_INC(v_pdpages); next = TAILQ_NEXT(m, plinks.q); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in inactive queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in inactive queue", m)); /* * The page or object lock acquisitions fail if the * page was removed from the queue or moved to a * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ if (!vm_pageout_page_lock(m, &next)) goto unlock_page; else if (m->wire_count != 0) { /* * Wired pages may not be freed, and unwiring a queued * page will cause it to be requeued. Thus, remove them * from the queue now to avoid unnecessary revisits. */ vm_page_dequeue_locked(m); addl_page_shortage++; goto unlock_page; } else if (m->hold_count != 0) { /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted * from the inactive count. See the * calculation of inactq_shortage before the * loop over the active queue below. */ addl_page_shortage++; goto unlock_page; } object = m->object; if (!VM_OBJECT_TRYWLOCK(object)) { if (!vm_pageout_fallback_object_lock(m, &next)) goto unlock_object; else if (m->wire_count != 0) { vm_page_dequeue_locked(m); addl_page_shortage++; goto unlock_object; } else if (m->hold_count != 0) { addl_page_shortage++; goto unlock_object; } } if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; unlock_object: VM_OBJECT_WUNLOCK(object); unlock_page: vm_page_unlock(m); continue; } KASSERT(!vm_page_held(m), ("Held page %p", m)); /* * Dequeue the inactive page and unlock the inactive page * queue, invalidating the 'next' pointer. Dequeueing the * page here avoids a later reacquisition (and release) of * the inactive page queue lock when vm_page_activate(), * vm_page_free(), or vm_page_launder() is called. Use a * marker to remember our place in the inactive queue. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); vm_page_dequeue_locked(m); vm_pagequeue_unlock(pq); queue_locked = FALSE; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { VM_CNT_INC(v_reactivated); vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the inactive queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. */ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; } else if ((object->flags & OBJ_DEAD) == 0) { vm_pagequeue_lock(pq); queue_locked = TRUE; m->queue = PQ_INACTIVE; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); goto drop_page; } } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } /* * Clean pages can be freed, but dirty pages must be sent back * to the laundry, unless they belong to a dead object. * Requeueing dirty pages from dead objects is pointless, as * they are being paged out and freed by the thread that * destroyed the object. */ if (m->dirty == 0) { free_page: page_shortage -= vm_pageout_free_pages(&pglist, object, m); goto lock_queue; } else if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); lock_queue: if (!queue_locked) { vm_pageout_pglist_flush(&pglist, false); vm_pagequeue_lock(pq); queue_locked = TRUE; } next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } vm_pagequeue_unlock(pq); vm_pageout_pglist_flush(&pglist, true); /* * Wake up the laundry thread so that it can perform any needed * laundering. If we didn't meet our target, we're in shortfall and * need to launder more aggressively. If PQ_LAUNDRY is empty and no * swap devices are configured, the laundry thread has no work to do, so * don't bother waking it up. * * The laundry thread uses the number of inactive queue scans elapsed * since the last laundering to determine whether to launder again, so * keep count. */ if (starting_page_shortage > 0) { pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; vm_pagequeue_lock(pq); if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE && (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) { if (page_shortage > 0) { vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL; VM_CNT_INC(v_pdshortfalls); } else if (vmd->vmd_laundry_request != VM_LAUNDRY_SHORTFALL) vmd->vmd_laundry_request = VM_LAUNDRY_BACKGROUND; wakeup(&vmd->vmd_laundry_request); } vmd->vmd_inactq_scans++; vm_pagequeue_unlock(pq); } /* * Wakeup the swapout daemon if we didn't free the targeted number of * pages. */ if (page_shortage > 0) vm_swapout_run(); /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Compute the number of pages we want to try to move from the * active queue to either the inactive or laundry queue. * * When scanning active pages, we make clean pages count more heavily * towards the page shortage than dirty pages. This is because dirty * pages must be laundered before they can be reused and thus have less * utility when attempting to quickly alleviate a shortage. However, * this weighting also causes the scan to deactivate dirty pages more * more aggressively, improving the effectiveness of clustering and * ensuring that they can eventually be reused. */ inactq_shortage = vmd->vmd_inactive_target - (pq->pq_cnt + vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight) + vm_paging_target(vmd) + deficit + addl_page_shortage; inactq_shortage *= act_scan_laundry_weight; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. */ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, scanned++) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in active queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in active queue", m)); if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); continue; } /* * The count for page daemon pages is updated after checking * the page for eligibility. */ VM_CNT_INC(v_pdpages); /* * Wired pages are dequeued lazily. */ if (m->wire_count != 0) { vm_page_dequeue_locked(m); vm_page_unlock(m); continue; } /* * Check to see "how much" the page has been used. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; /* * Perform an unsynchronized object ref count check. While * the page lock ensures that the page is not reallocated to * another object, in particular, one with unmanaged mappings * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: * 1) The count was transitioning to zero, but we saw a non- * zero value. pmap_ts_referenced() will return zero * because the page is not mapped. * 2) The count was transitioning to one, but we saw zero. * This race delays the detection of a new reference. At * worst, we will deactivate and reactivate the page. */ if (m->object->ref_count != 0) act_delta += pmap_ts_referenced(m); /* * Advance or decay the act_count based on recent usage. */ if (act_delta != 0) { m->act_count += ACT_ADVANCE + act_delta; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } else m->act_count -= min(m->act_count, ACT_DECLINE); /* * Move this page to the tail of the active, inactive or laundry * queue depending on usage. */ if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); /* * When not short for inactive pages, let dirty pages go * through the inactive queue before moving to the * laundry queues. This gives them some extra time to * be reactivated, potentially avoiding an expensive * pageout. During a page shortage, the inactive queue * is necessarily small, so we may move dirty pages * directly to the laundry queue. */ if (inactq_shortage <= 0) vm_page_deactivate(m); else { /* * Calling vm_page_test_dirty() here would * require acquisition of the object's write * lock. However, during a page shortage, * directing dirty pages into the laundry * queue is only an optimization and not a * requirement. Therefore, we simply rely on * the opportunistic updates to the page's * dirty field by the pmap. */ if (m->dirty == 0) { vm_page_deactivate(m); inactq_shortage -= act_scan_laundry_weight; } else { vm_page_launder(m); inactq_shortage--; } } } else vm_page_requeue_locked(m); vm_page_unlock(m); } vm_pagequeue_unlock(pq); if (pass > 0) vm_swapout_run_idle(); return (page_shortage <= 0); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. */ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. */ static long vm_pageout_oom_pagecount(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; long res; map = &vmspace->vm_map; KASSERT(!map->system_map, ("system map")); sx_assert(&map->lock, SA_LOCKED); res = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; obj = entry->object.vm_object; if (obj == NULL) continue; if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && obj->ref_count != 1) continue; switch (obj->type) { case OBJT_DEFAULT: case OBJT_SWAP: case OBJT_PHYS: case OBJT_VNODE: res += obj->resident_page_count; break; } } return (res); } void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; bool breakout; /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of its child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = false; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td) && !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = true; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD_LITE(p); PROC_UNLOCK(p); sx_sunlock(&allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); continue; } size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); sx_slock(&allproc_lock); /* * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { int i; if (vm_panic_on_oom != 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); for (i = 0; i < vm_ndomains; i++) wakeup(&VM_DOMAIN(i)->vmd_free_count); } } static void vm_pageout_worker(void *arg) { struct vm_domain *vmd; int domain, pass; bool target_met; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); pass = 0; target_met = true; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(vmd->vmd_segs != 0, ("domain without segments")); vmd->vmd_last_active_scan = ticks; vm_pageout_init_marker(&vmd->vmd_marker, PQ_INACTIVE); vm_pageout_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, &vmd->vmd_inacthead, plinks.q); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { vm_domain_free_lock(vmd); /* * Generally, after a level >= 1 scan, if there are enough * free pages to wakeup the waiters, then they are already * awake. A call to vm_page_free() during the scan awakened * them. However, in the following case, this wakeup serves * to bound the amount of time that a thread might wait. * Suppose a thread's call to vm_page_alloc() fails, but * before that thread calls VM_WAIT, enough pages are freed by * other threads to alleviate the free page shortage. The * thread will, nonetheless, wait until another page is freed * or this wakeup is performed. */ if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) { vmd->vmd_pages_needed = false; wakeup(&vmd->vmd_free_count); } /* * Do not clear vmd_pageout_wanted until we reach our free page * target. Otherwise, we may be awakened over and over again, * wasting CPU time. */ if (vmd->vmd_pageout_wanted && target_met) vmd->vmd_pageout_wanted = false; /* * Might the page daemon receive a wakeup call? */ if (vmd->vmd_pageout_wanted) { /* * No. Either vmd_pageout_wanted was set by another * thread during the previous scan, which must have * been a level 0 scan, or vmd_pageout_wanted was * already set and the scan failed to free enough * pages. If we haven't yet performed a level >= 1 * (page reclamation) scan, then increase the level * and scan again now. Otherwise, sleep a bit and * try again later. */ vm_domain_free_unlock(vmd); if (pass >= 1) pause("pwait", hz / VM_INACT_SCAN_RATE); pass++; } else { /* * Yes. If threads are still sleeping in VM_WAIT * then we immediately start a new scan. Otherwise, * sleep until the next wakeup or until pages need to * have their reference stats updated. */ if (vmd->vmd_pages_needed) { vm_domain_free_unlock(vmd); if (pass == 0) pass++; } else if (mtx_sleep(&vmd->vmd_pageout_wanted, vm_domain_free_lockptr(vmd), PDROP | PVM, "psleep", hz) == 0) { VM_CNT_INC(v_pdwakeups); pass = 1; } else pass = 0; } target_met = vm_pageout_scan(vmd, pass); } } /* * vm_pageout_init initialises basic pageout daemon settings. */ static void vm_pageout_init_domain(int domain) { struct vm_domain *vmd; int lim, i, j; vmd = VM_DOMAIN(domain); vmd->vmd_interrupt_free_min = 2; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (vmd->vmd_page_count > 1024) vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200; else vmd->vmd_free_min = 4; vmd->vmd_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + vmd->vmd_interrupt_free_min; vmd->vmd_free_reserved = vm_pageout_page_count + vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768); vmd->vmd_free_severe = vmd->vmd_free_min / 2; vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved; vmd->vmd_free_min += vmd->vmd_free_reserved; vmd->vmd_free_severe += vmd->vmd_free_reserved; vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2; if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3) vmd->vmd_inactive_target = vmd->vmd_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. */ vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_min / 10) * 11; /* * Target amount of memory to move out of the laundry queue during a * background laundering. This is proportional to the amount of system * memory. */ vmd->vmd_background_launder_target = (vmd->vmd_free_target - vmd->vmd_free_min) / 10; /* * Set batch queue limits for paging queues. * * We want these to be small relative to the amount of system memory. * Roughly v_page_count / PA_LOCK_COUNT pages are mapped to a given * batch queue; ensure that no more than 0.1% of them may be queued in * the batch queue for a particular page queue. Then no more than * 0.1% * PQ_COUNT can be queued across all page queues. This gives a * per-page queue batch limit of 1 page per GB of memory on amd64. */ lim = MAX(vmd->vmd_page_count / 1000 / BPQ_COUNT, 8); for (i = 0; i < PQ_COUNT; i++) for (j = 0; j < BPQ_COUNT; j++) vmd->vmd_pagequeues[i].pq_bpqs[j].bpq_lim = lim; } static void vm_pageout_init(void) { u_int freecount; int i; /* * Initialize some paging parameters. */ if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; freecount = 0; for (i = 0; i < vm_ndomains; i++) { struct vm_domain *vmd; vm_pageout_init_domain(i); vmd = VM_DOMAIN(i); vm_cnt.v_free_reserved += vmd->vmd_free_reserved; vm_cnt.v_free_target += vmd->vmd_free_target; vm_cnt.v_free_min += vmd->vmd_free_min; vm_cnt.v_inactive_target += vmd->vmd_inactive_target; vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min; vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min; vm_cnt.v_free_severe += vmd->vmd_free_severe; freecount += vmd->vmd_free_count; } /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; if (vm_page_max_wired == 0) vm_page_max_wired = freecount / 3; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { int error; int i; swap_pager_swap_init(); error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, 0, 0, "laundry: dom0"); if (error != 0) panic("starting laundry for domain 0, error %d", error); for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { panic("starting pageout for domain %d, error %d\n", i, error); } error = kthread_add(vm_pageout_laundry_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "laundry: dom%d", i); if (error != 0) panic("starting laundry for domain %d, error %d", i, error); } error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); vm_pageout_worker((void *)(uintptr_t)0); } /* * Perform an advisory wakeup of the page daemon. */ void pagedaemon_wakeup(int domain) { struct vm_domain *vmd; vmd = VM_DOMAIN(domain); vm_domain_free_assert_unlocked(vmd); if (!vmd->vmd_pageout_wanted && curthread->td_proc != pageproc) { vmd->vmd_pageout_wanted = true; wakeup(&vmd->vmd_pageout_wanted); } } /* * Wake up the page daemon and wait for it to reclaim free pages. * * This function returns with the free queues mutex unlocked. */ void pagedaemon_wait(int domain, int pri, const char *wmesg) { struct vm_domain *vmd; vmd = VM_DOMAIN(domain); vm_domain_free_assert_locked(vmd); /* * vmd_pageout_wanted may have been set by an advisory wakeup, but if * the page daemon is running on a CPU, the wakeup will have been lost. * Thus, deliver a potentially spurious wakeup to ensure that the page * daemon has been notified of the shortage. */ if (!vmd->vmd_pageout_wanted || !vmd->vmd_pages_needed) { vmd->vmd_pageout_wanted = true; wakeup(&vmd->vmd_pageout_wanted); } vmd->vmd_pages_needed = true; vmd->vmd_waiters++; msleep(&vmd->vmd_free_count, vm_domain_free_lockptr(vmd), PDROP | pri, wmesg, 0); vmd->vmd_waiters--; }