Index: head/sys/sys/vmmeter.h =================================================================== --- head/sys/sys/vmmeter.h +++ head/sys/sys/vmmeter.h @@ -75,9 +75,10 @@ u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */ u_int v_vnodepgsout; /* (p) vnode pager pages paged out */ u_int v_intrans; /* (p) intransit blocking page faults */ - u_int v_reactivated; /* (f) pages reactivated from free list */ + u_int v_reactivated; /* (p) pages reactivated by the pagedaemon */ u_int v_pdwakeups; /* (p) times daemon has awaken from sleep */ u_int v_pdpages; /* (p) pages analyzed by daemon */ + u_int v_pdshortfalls; /* (p) page reclamation shortfalls */ u_int v_tcached; /* (p) total pages cached */ u_int v_dfree; /* (p) pages freed by daemon */ @@ -96,6 +97,7 @@ u_int v_active_count; /* (q) pages active */ u_int v_inactive_target; /* (c) pages desired inactive */ u_int v_inactive_count; /* (q) pages inactive */ + u_int v_laundry_count; /* (q) pages eligible for laundering */ u_int v_cache_count; /* (f) pages on cache queue */ u_int v_pageout_free_min; /* (c) min pages reserved for kernel */ u_int v_interrupt_free_min; /* (c) reserved pages for int code */ @@ -111,7 +113,6 @@ u_int v_vforkpages; /* (p) VM pages affected by vfork() */ u_int v_rforkpages; /* (p) VM pages affected by rfork() */ u_int v_kthreadpages; /* (p) VM pages affected by fork() by kernel */ - u_int v_spare[2]; }; #ifdef _KERNEL @@ -184,6 +185,25 @@ vm_pageout_wakeup_thresh); } +/* + * Return the number of pages we need to launder. + * A positive number indicates that we have a shortfall of clean pages. + */ +static inline int +vm_laundry_target(void) +{ + + return (vm_paging_target()); +} + +/* + * Obtain the value of a per-CPU counter. + */ +#define VM_METER_PCPU_CNT(member) \ + vm_meter_cnt(__offsetof(struct vmmeter, member)) + +u_int vm_meter_cnt(size_t); + #endif /* systemwide totals computed every five seconds */ Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c +++ head/sys/vm/swap_pager.c @@ -1549,17 +1549,18 @@ * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). + * A page is only written to swap after a period of + * inactivity. Therefore, we do not expect it to be + * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); + vm_page_lock(m); + vm_page_deactivate_noreuse(m); + vm_page_unlock(m); vm_page_sunbusy(m); - if (vm_page_count_severe()) { - vm_page_lock(m); - vm_page_try_to_cache(m); - vm_page_unlock(m); - } } } @@ -1635,12 +1636,15 @@ /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * - * This routine dissociates the page at the given index within a - * swap block from its backing store, paging it in if necessary. - * If the page is paged in, it is placed in the inactive queue, - * since it had its backing store ripped out from under it. - * We also attempt to swap in all other pages in the swap block, - * we only guarantee that the one at the specified index is + * This routine dissociates the page at the given index within an object + * from its backing store, paging it in if it does not reside in memory. + * If the page is paged in, it is marked dirty and placed in the laundry + * queue. The page is marked dirty because it no longer has backing + * store. It is placed in the laundry queue because it has not been + * accessed recently. Otherwise, it would already reside in memory. + * + * We also attempt to swap in all other pages in the swap block. + * However, we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we @@ -1669,7 +1673,7 @@ vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); - vm_page_deactivate(m); + vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c +++ head/sys/vm/vm_fault.c @@ -290,12 +290,13 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { - vm_prot_t prot; - vm_object_t next_object; struct faultstate fs; struct vnode *vp; + vm_object_t next_object, retry_object; vm_offset_t e_end, e_start; vm_page_t m; + vm_pindex_t retry_pindex; + vm_prot_t prot, retry_prot; int ahead, alloc_req, behind, cluster_offset, error, era, faultcount; int locked, map_generation, nera, result, rv; u_char behavior; @@ -946,10 +947,6 @@ * lookup. */ if (!fs.lookup_still_valid) { - vm_object_t retry_object; - vm_pindex_t retry_pindex; - vm_prot_t retry_prot; - if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); Index: head/sys/vm/vm_meter.c =================================================================== --- head/sys/vm/vm_meter.c +++ head/sys/vm/vm_meter.c @@ -216,29 +216,37 @@ } /* - * vcnt() - accumulate statistics from all cpus and the global cnt - * structure. + * vm_meter_cnt() - accumulate statistics from all cpus and the global cnt + * structure. * * The vmmeter structure is now per-cpu as well as global. Those * statistics which can be kept on a per-cpu basis (to avoid cache * stalls between cpus) can be moved to the per-cpu vmmeter. Remaining * statistics, such as v_free_reserved, are left in the global * structure. - * - * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) */ -static int -vcnt(SYSCTL_HANDLER_ARGS) +u_int +vm_meter_cnt(size_t offset) { - int count = *(int *)arg1; - int offset = (char *)arg1 - (char *)&vm_cnt; + struct pcpu *pcpu; + u_int count; int i; + count = *(u_int *)((char *)&vm_cnt + offset); CPU_FOREACH(i) { - struct pcpu *pcpu = pcpu_find(i); - count += *(int *)((char *)&pcpu->pc_cnt + offset); + pcpu = pcpu_find(i); + count += *(u_int *)((char *)&pcpu->pc_cnt + offset); } - return (SYSCTL_OUT(req, &count, sizeof(int))); + return (count); +} + +static int +cnt_sysctl(SYSCTL_HANDLER_ARGS) +{ + u_int count; + + count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt); + return (SYSCTL_OUT(req, &count, sizeof(count))); } SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, @@ -253,8 +261,8 @@ #define VM_STATS(parent, var, descr) \ SYSCTL_PROC(parent, OID_AUTO, var, \ - CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, vcnt, \ - "IU", descr) + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, \ + cnt_sysctl, "IU", descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) @@ -278,9 +286,10 @@ VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); VM_STATS_VM(v_intrans, "In transit page faults"); -VM_STATS_VM(v_reactivated, "Pages reactivated from free list"); +VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon"); +VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); VM_STATS_VM(v_tcached, "Total pages cached"); VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); @@ -295,6 +304,7 @@ VM_STATS_VM(v_active_count, "Active pages"); VM_STATS_VM(v_inactive_target, "Desired inactive pages"); VM_STATS_VM(v_inactive_count, "Inactive pages"); +VM_STATS_VM(v_laundry_count, "Pages eligible for laundering"); VM_STATS_VM(v_cache_count, "Pages on cache queue"); VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel"); VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code"); Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c +++ head/sys/vm/vm_object.c @@ -2329,9 +2329,9 @@ * sysctl is only meant to give an * approximation of the system anyway. */ - if (m->queue == PQ_ACTIVE) + if (vm_page_active(m)) kvo.kvo_active++; - else if (m->queue == PQ_INACTIVE) + else if (vm_page_inactive(m)) kvo.kvo_inactive++; } Index: head/sys/vm/vm_page.h =================================================================== --- head/sys/vm/vm_page.h +++ head/sys/vm/vm_page.h @@ -206,7 +206,8 @@ #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 -#define PQ_COUNT 2 +#define PQ_LAUNDRY 2 +#define PQ_COUNT 3 TAILQ_HEAD(pglist, vm_page); SLIST_HEAD(spglist, vm_page); @@ -228,6 +229,7 @@ boolean_t vmd_oom; int vmd_oom_seq; int vmd_last_active_scan; + struct vm_page vmd_laundry_marker; struct vm_page vmd_marker; /* marker for pagedaemon private use */ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ }; @@ -236,6 +238,7 @@ #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) +#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #ifdef _KERNEL @@ -327,7 +330,6 @@ #define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ #define PG_ZERO 0x0008 /* page is zeroed */ #define PG_MARKER 0x0010 /* special queue marker page */ -#define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */ #define PG_NODUMP 0x0080 /* don't include this page in a dump */ #define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */ @@ -451,10 +453,8 @@ vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_freelist(int, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); -void vm_page_cache(vm_page_t); void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t); -int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_deactivate (vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); @@ -465,6 +465,7 @@ void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex); +void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); @@ -697,5 +698,26 @@ (void)mret; } +static inline bool +vm_page_active(vm_page_t m) +{ + + return (m->queue == PQ_ACTIVE); +} + +static inline bool +vm_page_inactive(vm_page_t m) +{ + + return (m->queue == PQ_INACTIVE); +} + +static inline bool +vm_page_in_laundry(vm_page_t m) +{ + + return (m->queue == PQ_LAUNDRY); +} + #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c +++ head/sys/vm/vm_page.c @@ -390,6 +390,10 @@ "vm active pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = + "vm laundry pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = + &vm_cnt.v_laundry_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; @@ -1730,9 +1734,7 @@ ("vm_page_alloc: cached page %p is PG_ZERO", m)); KASSERT(m->valid != 0, ("vm_page_alloc: cached page %p is invalid", m)); - if (m->object == object && m->pindex == pindex) - vm_cnt.v_reactivated++; - else + if (m->object != object || m->pindex != pindex) m->valid = 0; m_object = m->object; vm_page_cache_remove(m); @@ -2254,7 +2256,7 @@ } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); - /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) @@ -2450,7 +2452,7 @@ } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); - /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) @@ -2778,7 +2780,10 @@ vm_page_pagequeue(vm_page_t m) { - return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); + if (vm_page_in_laundry(m)) + return (&vm_dom[0].vmd_pagequeues[m->queue]); + else + return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* @@ -2840,7 +2845,10 @@ KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); - pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; + if (queue == PQ_LAUNDRY) + pq = &vm_dom[0].vmd_pagequeues[queue]; + else + pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); @@ -3124,11 +3132,8 @@ if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && - m->object != NULL && queue != PQ_NONE) { - if (queue == PQ_INACTIVE) - m->flags &= ~PG_WINATCFLS; + m->object != NULL && queue != PQ_NONE) vm_page_enqueue(queue, m); - } return (TRUE); } else return (FALSE); @@ -3181,7 +3186,6 @@ } else { if (queue != PQ_NONE) vm_page_dequeue(m); - m->flags &= ~PG_WINATCFLS; vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; @@ -3221,24 +3225,25 @@ } /* - * vm_page_try_to_cache: + * vm_page_launder * - * Returns 0 on failure, 1 on success + * Put a page in the laundry. */ -int -vm_page_try_to_cache(vm_page_t m) +void +vm_page_launder(vm_page_t m) { + int queue; - vm_page_lock_assert(m, MA_OWNED); - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (m->dirty || m->hold_count || m->wire_count || - (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) - return (0); - pmap_remove_all(m); - if (m->dirty) - return (0); - vm_page_cache(m); - return (1); + vm_page_assert_locked(m); + if ((queue = m->queue) != PQ_LAUNDRY) { + if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { + if (queue != PQ_NONE) + vm_page_dequeue(m); + vm_page_enqueue(PQ_LAUNDRY, m); + } else + KASSERT(queue == PQ_NONE, + ("wired page %p is queued", m)); + } } /* @@ -3265,112 +3270,6 @@ } /* - * vm_page_cache - * - * Put the specified page onto the page cache queue (if appropriate). - * - * The object and page must be locked. - */ -void -vm_page_cache(vm_page_t m) -{ - vm_object_t object; - boolean_t cache_was_empty; - - vm_page_lock_assert(m, MA_OWNED); - object = m->object; - VM_OBJECT_ASSERT_WLOCKED(object); - if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) || - m->hold_count || m->wire_count) - panic("vm_page_cache: attempting to cache busy page"); - KASSERT(!pmap_page_is_mapped(m), - ("vm_page_cache: page %p is mapped", m)); - KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m)); - if (m->valid == 0 || object->type == OBJT_DEFAULT || - (object->type == OBJT_SWAP && - !vm_pager_has_page(object, m->pindex, NULL, NULL))) { - /* - * Hypothesis: A cache-eligible page belonging to a - * default object or swap object but without a backing - * store must be zero filled. - */ - vm_page_free(m); - return; - } - KASSERT((m->flags & PG_CACHED) == 0, - ("vm_page_cache: page %p is already cached", m)); - - /* - * Remove the page from the paging queues. - */ - vm_page_remque(m); - - /* - * Remove the page from the object's collection of resident - * pages. - */ - vm_radix_remove(&object->rtree, m->pindex); - TAILQ_REMOVE(&object->memq, m, listq); - object->resident_page_count--; - - /* - * Restore the default memory attribute to the page. - */ - if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) - pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); - - /* - * Insert the page into the object's collection of cached pages - * and the physical memory allocator's cache/free page queues. - */ - m->flags &= ~PG_ZERO; - mtx_lock(&vm_page_queue_free_mtx); - cache_was_empty = vm_radix_is_empty(&object->cache); - if (vm_radix_insert(&object->cache, m)) { - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && - object->resident_page_count == 0) - vdrop(object->handle); - m->object = NULL; - vm_page_free(m); - return; - } - - /* - * The above call to vm_radix_insert() could reclaim the one pre- - * existing cached page from this object, resulting in a call to - * vdrop(). - */ - if (!cache_was_empty) - cache_was_empty = vm_radix_is_singleton(&object->cache); - - m->flags |= PG_CACHED; - vm_cnt.v_cache_count++; - PCPU_INC(cnt.v_tcached); -#if VM_NRESERVLEVEL > 0 - if (!vm_reserv_free_page(m)) { -#else - if (TRUE) { -#endif - vm_phys_free_pages(m, 0); - } - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); - - /* - * Increment the vnode's hold count if this is the object's only - * cached page. Decrement the vnode's hold count if this was - * the object's only resident page. - */ - if (object->type == OBJT_VNODE) { - if (cache_was_empty && object->resident_page_count != 0) - vhold(object->handle); - else if (!cache_was_empty && object->resident_page_count == 0) - vdrop(object->handle); - } -} - -/* * vm_page_advise * * Deactivate or do nothing, as appropriate. @@ -3413,11 +3312,13 @@ /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that - * the page will be reused quickly. Dirty pages are given a chance to - * cycle once through the inactive queue before becoming eligible for - * laundering. + * the page will be reused quickly. Dirty pages not already in the + * laundry are moved there. */ - _vm_page_deactivate(m, m->dirty == 0); + if (m->dirty == 0) + vm_page_deactivate_noreuse(m); + else + vm_page_launder(m); } /* @@ -3926,6 +3827,7 @@ db_printf("vm_cnt.v_cache_count: %d\n", vm_cnt.v_cache_count); db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); + db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); @@ -3940,12 +3842,14 @@ db_printf("pq_free %d pq_cache %d\n", vm_cnt.v_free_count, vm_cnt.v_cache_count); for (dom = 0; dom < vm_ndomains; dom++) { - db_printf("dom %d page_cnt %d free %d pq_act %d pq_inact %d\n", + db_printf( + "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, - vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt); + vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, + vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt); } } Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c +++ head/sys/vm/vm_pageout.c @@ -119,7 +119,7 @@ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); -static int vm_pageout_clean(vm_page_t m); +static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static bool vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, @@ -154,6 +154,9 @@ SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif +/* Pagedaemon activity rates, in subdivisions of one second. */ +#define VM_LAUNDER_RATE 10 +#define VM_INACT_SCAN_RATE 2 int vm_pageout_deficit; /* Estimated number of pages deficit */ u_int vm_pageout_wakeup_thresh; @@ -161,6 +164,13 @@ bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ bool vm_pages_needed; /* Are threads waiting for free pages? */ +/* Pending request for dirty page laundering. */ +static enum { + VM_LAUNDRY_IDLE, + VM_LAUNDRY_BACKGROUND, + VM_LAUNDRY_SHORTFALL +} vm_laundry_request = VM_LAUNDRY_IDLE; + #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; @@ -168,9 +178,7 @@ /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); #endif -static int vm_max_launder = 32; static int vm_pageout_update_period; -static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; @@ -193,9 +201,6 @@ CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); -SYSCTL_INT(_vm, OID_AUTO, max_launder, - CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); - SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW, &vm_pageout_update_period, 0, "Maximum active LRU update period"); @@ -215,9 +220,6 @@ CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif -SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, - CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); - SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); @@ -229,6 +231,25 @@ CTLFLAG_RW, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); +static int act_scan_laundry_weight = 3; +SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW, + &act_scan_laundry_weight, 0, + "weight given to clean vs. dirty pages in active queue scans"); + +static u_int vm_background_launder_target; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW, + &vm_background_launder_target, 0, + "background laundering target, in pages"); + +static u_int vm_background_launder_rate = 4096; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW, + &vm_background_launder_rate, 0, + "background laundering rate, in kilobytes per second"); + +static u_int vm_background_launder_max = 20 * 1024; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW, + &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); + #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; @@ -236,7 +257,11 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); +static u_int isqrt(u_int num); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); +static int vm_pageout_launder(struct vm_domain *vmd, int launder, + bool in_shortfall); +static void vm_pageout_laundry_worker(void *arg); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); @@ -387,7 +412,7 @@ /* * We can cluster only if the page is not clean, busy, or held, and - * the page is inactive. + * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file @@ -413,7 +438,7 @@ break; } vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; @@ -439,7 +464,7 @@ if (p->dirty == 0) break; vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; @@ -519,23 +544,33 @@ ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); + /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* - * Page outside of range of object. Right now we - * essentially lose the changes by pretending it - * worked. + * The page is outside the object's range. We pretend + * that the page out worked and clean the page, so the + * changes will be lost if the page is reclaimed by + * the page daemon. */ vm_page_undirty(mt); + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* - * If page couldn't be paged out, then reactivate the - * page so it doesn't clog the inactive list. (We - * will try paging out it again later). + * If the page couldn't be paged out, then reactivate + * it so that it doesn't clog the laundry and inactive + * queues. (We will try paging it out again later). */ vm_page_lock(mt); vm_page_activate(mt); @@ -617,10 +652,10 @@ act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } - if (p->queue != PQ_ACTIVE && act_delta != 0) { + if (!vm_page_active(p) && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; - } else if (p->queue == PQ_ACTIVE) { + } else if (vm_page_active(p)) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); @@ -636,7 +671,7 @@ p->act_count += ACT_ADVANCE; vm_page_requeue(p); } - } else if (p->queue == PQ_INACTIVE) + } else if (vm_page_inactive(p)) pmap_remove_all(p); vm_page_unlock(p); } @@ -739,7 +774,7 @@ * Returns 0 on success and an errno otherwise. */ static int -vm_pageout_clean(vm_page_t m) +vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; @@ -797,7 +832,7 @@ * (3) reallocated to a different offset, or * (4) cleaned. */ - if (m->queue != PQ_INACTIVE || m->object != object || + if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; @@ -821,7 +856,7 @@ * laundry. If it is still in the laundry, then we * start the cleaning operation. */ - if (vm_pageout_cluster(m) == 0) + if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: @@ -840,11 +875,390 @@ } /* + * Attempt to launder the specified number of pages. + * + * Returns the number of pages successfully laundered. + */ +static int +vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) +{ + struct vm_pagequeue *pq; + vm_object_t object; + vm_page_t m, next; + int act_delta, error, maxscan, numpagedout, starting_target; + int vnodes_skipped; + bool pageout_ok, queue_locked; + + starting_target = launder; + vnodes_skipped = 0; + + /* + * Scan the laundry queue for pages eligible to be laundered. We stop + * once the target number of dirty pages have been laundered, or once + * we've reached the end of the queue. A single iteration of this loop + * may cause more than one page to be laundered because of clustering. + * + * maxscan ensures that we don't re-examine requeued pages. Any + * additional pages written as part of a cluster are subtracted from + * maxscan since they must be taken from the laundry queue. + */ + pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; + maxscan = pq->pq_cnt; + + vm_pagequeue_lock(pq); + queue_locked = true; + for (m = TAILQ_FIRST(&pq->pq_pl); + m != NULL && maxscan-- > 0 && launder > 0; + m = next) { + vm_pagequeue_assert_locked(pq); + KASSERT(queue_locked, ("unlocked laundry queue")); + KASSERT(vm_page_in_laundry(m), + ("page %p has an inconsistent queue", m)); + next = TAILQ_NEXT(m, plinks.q); + if ((m->flags & PG_MARKER) != 0) + continue; + KASSERT((m->flags & PG_FICTITIOUS) == 0, + ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); + if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { + vm_page_unlock(m); + continue; + } + object = m->object; + if ((!VM_OBJECT_TRYWLOCK(object) && + (!vm_pageout_fallback_object_lock(m, &next) || + m->hold_count != 0)) || vm_page_busied(m)) { + VM_OBJECT_WUNLOCK(object); + vm_page_unlock(m); + continue; + } + + /* + * Unlock the laundry queue, invalidating the 'next' pointer. + * Use a marker to remember our place in the laundry queue. + */ + TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, + plinks.q); + vm_pagequeue_unlock(pq); + queue_locked = false; + + /* + * Invalid pages can be easily freed. They cannot be + * mapped; vm_page_free() asserts this. + */ + if (m->valid == 0) + goto free_page; + + /* + * If the page has been referenced and the object is not dead, + * reactivate or requeue the page depending on whether the + * object is mapped. + */ + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta = 1; + } else + act_delta = 0; + if (object->ref_count != 0) + act_delta += pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); + } + if (act_delta != 0) { + if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); + vm_page_activate(m); + + /* + * Increase the activation count if the page + * was referenced while in the laundry queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; + + /* + * If this was a background laundering, count + * activated pages towards our target. The + * purpose of background laundering is to ensure + * that pages are eventually cycled through the + * laundry queue, and an activation is a valid + * way out. + */ + if (!in_shortfall) + launder--; + goto drop_page; + } else if ((object->flags & OBJ_DEAD) == 0) + goto requeue_page; + } + + /* + * If the page appears to be clean at the machine-independent + * layer, then remove all of its mappings from the pmap in + * anticipation of freeing it. If, however, any of the page's + * mappings allow write access, then the page may still be + * modified until the last of those mappings are removed. + */ + if (object->ref_count != 0) { + vm_page_test_dirty(m); + if (m->dirty == 0) + pmap_remove_all(m); + } + + /* + * Clean pages are freed, and dirty pages are paged out unless + * they belong to a dead object. Requeueing dirty pages from + * dead objects is pointless, as they are being paged out and + * freed by the thread that destroyed the object. + */ + if (m->dirty == 0) { +free_page: + vm_page_free(m); + PCPU_INC(cnt.v_dfree); + } else if ((object->flags & OBJ_DEAD) == 0) { + if (object->type != OBJT_SWAP && + object->type != OBJT_DEFAULT) + pageout_ok = true; + else if (disable_swap_pageouts) + pageout_ok = false; + else + pageout_ok = true; + if (!pageout_ok) { +requeue_page: + vm_pagequeue_lock(pq); + queue_locked = true; + vm_page_requeue_locked(m); + goto drop_page; + } + + /* + * Form a cluster with adjacent, dirty pages from the + * same object, and page out that entire cluster. + * + * The adjacent, dirty pages must also be in the + * laundry. However, their mappings are not checked + * for new references. Consequently, a recently + * referenced page may be paged out. However, that + * page will not be prematurely reclaimed. After page + * out, the page will be placed in the inactive queue, + * where any new references will be detected and the + * page reactivated. + */ + error = vm_pageout_clean(m, &numpagedout); + if (error == 0) { + launder -= numpagedout; + maxscan -= numpagedout - 1; + } else if (error == EDEADLK) { + pageout_lock_miss++; + vnodes_skipped++; + } + goto relock_queue; + } +drop_page: + vm_page_unlock(m); + VM_OBJECT_WUNLOCK(object); +relock_queue: + if (!queue_locked) { + vm_pagequeue_lock(pq); + queue_locked = true; + } + next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); + TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); + } + vm_pagequeue_unlock(pq); + + /* + * Wakeup the sync daemon if we skipped a vnode in a writeable object + * and we didn't launder enough pages. + */ + if (vnodes_skipped > 0 && launder > 0) + (void)speedup_syncer(); + + return (starting_target - launder); +} + +/* + * Compute the integer square root. + */ +static u_int +isqrt(u_int num) +{ + u_int bit, root, tmp; + + bit = 1u << ((NBBY * sizeof(u_int)) - 2); + while (bit > num) + bit >>= 2; + root = 0; + while (bit != 0) { + tmp = root + bit; + root >>= 1; + if (num >= tmp) { + num -= tmp; + root += bit; + } + bit >>= 2; + } + return (root); +} + +/* + * Perform the work of the laundry thread: periodically wake up and determine + * whether any pages need to be laundered. If so, determine the number of pages + * that need to be laundered, and launder them. + */ +static void +vm_pageout_laundry_worker(void *arg) +{ + struct vm_domain *domain; + struct vm_pagequeue *pq; + uint64_t nclean, ndirty; + u_int last_launder, wakeups; + int domidx, last_target, launder, shortfall, shortfall_cycle, target; + bool in_shortfall; + + domidx = (uintptr_t)arg; + domain = &vm_dom[domidx]; + pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; + KASSERT(domain->vmd_segs != 0, ("domain without segments")); + vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); + + shortfall = 0; + in_shortfall = false; + shortfall_cycle = 0; + target = 0; + last_launder = 0; + + /* + * The pageout laundry worker is never done, so loop forever. + */ + for (;;) { + KASSERT(target >= 0, ("negative target %d", target)); + KASSERT(shortfall_cycle >= 0, + ("negative cycle %d", shortfall_cycle)); + launder = 0; + wakeups = VM_METER_PCPU_CNT(v_pdwakeups); + + /* + * First determine whether we need to launder pages to meet a + * shortage of free pages. + */ + if (shortfall > 0) { + in_shortfall = true; + shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; + target = shortfall; + } else if (!in_shortfall) + goto trybackground; + else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { + /* + * We recently entered shortfall and began laundering + * pages. If we have completed that laundering run + * (and we are no longer in shortfall) or we have met + * our laundry target through other activity, then we + * can stop laundering pages. + */ + in_shortfall = false; + target = 0; + goto trybackground; + } + last_launder = wakeups; + launder = target / shortfall_cycle--; + goto dolaundry; + + /* + * There's no immediate need to launder any pages; see if we + * meet the conditions to perform background laundering: + * + * 1. The ratio of dirty to clean inactive pages exceeds the + * background laundering threshold and the pagedaemon has + * been woken up to reclaim pages since our last + * laundering, or + * 2. we haven't yet reached the target of the current + * background laundering run. + * + * The background laundering threshold is not a constant. + * Instead, it is a slowly growing function of the number of + * page daemon wakeups since the last laundering. Thus, as the + * ratio of dirty to clean inactive pages grows, the amount of + * memory pressure required to trigger laundering decreases. + */ +trybackground: + nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; + ndirty = vm_cnt.v_laundry_count; + if (target == 0 && wakeups != last_launder && + ndirty * isqrt(wakeups - last_launder) >= nclean) { + target = vm_background_launder_target; + } + + /* + * We have a non-zero background laundering target. If we've + * laundered up to our maximum without observing a page daemon + * wakeup, just stop. This is a safety belt that ensures we + * don't launder an excessive amount if memory pressure is low + * and the ratio of dirty to clean pages is large. Otherwise, + * proceed at the background laundering rate. + */ + if (target > 0) { + if (wakeups != last_launder) { + last_launder = wakeups; + last_target = target; + } else if (last_target - target >= + vm_background_launder_max * PAGE_SIZE / 1024) { + target = 0; + } + launder = vm_background_launder_rate * PAGE_SIZE / 1024; + launder /= VM_LAUNDER_RATE; + if (launder > target) + launder = target; + } + +dolaundry: + if (launder > 0) { + /* + * Because of I/O clustering, the number of laundered + * pages could exceed "target" by the maximum size of + * a cluster minus one. + */ + target -= min(vm_pageout_launder(domain, launder, + in_shortfall), target); + pause("laundp", hz / VM_LAUNDER_RATE); + } + + /* + * If we're not currently laundering pages and the page daemon + * hasn't posted a new request, sleep until the page daemon + * kicks us. + */ + vm_pagequeue_lock(pq); + if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) + (void)mtx_sleep(&vm_laundry_request, + vm_pagequeue_lockptr(pq), PVM, "launds", 0); + + /* + * If the pagedaemon has indicated that it's in shortfall, start + * a shortfall laundering unless we're already in the middle of + * one. This may preempt a background laundering. + */ + if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && + (!in_shortfall || shortfall_cycle == 0)) { + shortfall = vm_laundry_target() + vm_pageout_deficit; + target = 0; + } else + shortfall = 0; + + if (target == 0) + vm_laundry_request = VM_LAUNDRY_IDLE; + vm_pagequeue_unlock(pq); + } +} + +/* * vm_pageout_scan does the dirty work for the pageout daemon. * - * pass 0 - Update active LRU/deactivate pages - * pass 1 - Free inactive pages - * pass 2 - Launder dirty pages + * pass == 0: Update active LRU/deactivate pages + * pass >= 1: Free inactive pages * * Returns true if pass was zero or enough pages were freed by the inactive * queue scan to meet the target. @@ -856,10 +1270,9 @@ struct vm_pagequeue *pq; vm_object_t object; long min_scan; - int act_delta, addl_page_shortage, deficit, error, inactq_shortage; - int maxlaunder, maxscan, page_shortage, scan_tick, scanned; - int starting_page_shortage, vnodes_skipped; - boolean_t pageout_ok, queue_locked; + int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; + int page_shortage, scan_tick, scanned, starting_page_shortage; + boolean_t queue_locked; /* * If we need to reclaim memory ask kernel caches to return @@ -901,23 +1314,6 @@ starting_page_shortage = page_shortage; /* - * maxlaunder limits the number of dirty pages we flush per scan. - * For most systems a smaller value (16 or 32) is more robust under - * extreme memory and disk pressure because any unnecessary writes - * to disk can result in extreme performance degredation. However, - * systems with excessive dirty pages (especially when MAP_NOSYNC is - * used) will die horribly with limited laundering. If the pageout - * daemon cannot clean enough pages in the first pass, we let it go - * all out in succeeding passes. - */ - if ((maxlaunder = vm_max_launder) <= 1) - maxlaunder = 1; - if (pass > 1) - maxlaunder = 10000; - - vnodes_skipped = 0; - - /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make @@ -932,7 +1328,7 @@ m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked inactive queue")); - KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); + KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); @@ -995,11 +1391,15 @@ KASSERT(m->hold_count == 0, ("Held page %p", m)); /* - * We unlock the inactive page queue, invalidating the - * 'next' pointer. Use our marker to remember our - * place. + * Dequeue the inactive page and unlock the inactive page + * queue, invalidating the 'next' pointer. Dequeueing the + * page here avoids a later reacquisition (and release) of + * the inactive page queue lock when vm_page_activate(), + * vm_page_free(), or vm_page_launder() is called. Use a + * marker to remember our place in the inactive queue. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); + vm_page_dequeue_locked(m); vm_pagequeue_unlock(pq); queue_locked = FALSE; @@ -1028,6 +1428,7 @@ } if (act_delta != 0) { if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); vm_page_activate(m); /* @@ -1039,8 +1440,14 @@ */ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; - } else if ((object->flags & OBJ_DEAD) == 0) - goto requeue_page; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_pagequeue_lock(pq); + queue_locked = TRUE; + m->queue = PQ_INACTIVE; + TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); + vm_pagequeue_cnt_inc(pq); + goto drop_page; + } } /* @@ -1056,83 +1463,23 @@ pmap_remove_all(m); } + /* + * Clean pages can be freed, but dirty pages must be sent back + * to the laundry, unless they belong to a dead object. + * Requeueing dirty pages from dead objects is pointless, as + * they are being paged out and freed by the thread that + * destroyed the object. + */ if (m->dirty == 0) { - /* - * Clean pages can be freed. - */ free_page: vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; - } else if ((object->flags & OBJ_DEAD) != 0) { - /* - * Leave dirty pages from dead objects at the front of - * the queue. They are being paged out and freed by - * the thread that destroyed the object. They will - * leave the queue shortly after the scan finishes, so - * they should be discounted from the inactive count. - */ - addl_page_shortage++; - } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { - /* - * Dirty pages need to be paged out, but flushing - * a page is extremely expensive versus freeing - * a clean page. Rather then artificially limiting - * the number of pages we can flush, we instead give - * dirty pages extra priority on the inactive queue - * by forcing them to be cycled through the queue - * twice before being flushed, after which the - * (now clean) page will cycle through once more - * before being freed. This significantly extends - * the thrash point for a heavily loaded machine. - */ - m->flags |= PG_WINATCFLS; -requeue_page: - vm_pagequeue_lock(pq); - queue_locked = TRUE; - vm_page_requeue_locked(m); - } else if (maxlaunder > 0) { - /* - * We always want to try to flush some dirty pages if - * we encounter them, to keep the system stable. - * Normally this number is small, but under extreme - * pressure where there are insufficient clean pages - * on the inactive queue, we may have to go all out. - */ - - if (object->type != OBJT_SWAP && - object->type != OBJT_DEFAULT) - pageout_ok = TRUE; - else if (disable_swap_pageouts) - pageout_ok = FALSE; - else if (defer_swap_pageouts) - pageout_ok = vm_page_count_min(); - else - pageout_ok = TRUE; - if (!pageout_ok) - goto requeue_page; - error = vm_pageout_clean(m); - /* - * Decrement page_shortage on success to account for - * the (future) cleaned page. Otherwise we could wind - * up laundering or cleaning too many pages. - */ - if (error == 0) { - page_shortage--; - maxlaunder--; - } else if (error == EDEADLK) { - pageout_lock_miss++; - vnodes_skipped++; - } else if (error == EBUSY) { - addl_page_shortage++; - } - vm_page_lock_assert(m, MA_NOTOWNED); - goto relock_queue; - } + } else if ((object->flags & OBJ_DEAD) == 0) + vm_page_launder(m); drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); -relock_queue: if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = TRUE; @@ -1142,6 +1489,24 @@ } vm_pagequeue_unlock(pq); + /* + * Wake up the laundry thread so that it can perform any needed + * laundering. If we didn't meet our target, we're in shortfall and + * need to launder more aggressively. + */ + if (vm_laundry_request == VM_LAUNDRY_IDLE && + starting_page_shortage > 0) { + pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; + vm_pagequeue_lock(pq); + if (page_shortage > 0) { + vm_laundry_request = VM_LAUNDRY_SHORTFALL; + PCPU_INC(cnt.v_pdshortfalls); + } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) + vm_laundry_request = VM_LAUNDRY_BACKGROUND; + wakeup(&vm_laundry_request); + vm_pagequeue_unlock(pq); + } + #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't free the targeted number of @@ -1152,14 +1517,6 @@ #endif /* - * Wakeup the sync daemon if we skipped a vnode in a writeable object - * and we didn't free enough pages. - */ - if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target - - vm_cnt.v_free_min) - (void)speedup_syncer(); - - /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ @@ -1167,10 +1524,20 @@ /* * Compute the number of pages we want to try to move from the - * active queue to the inactive queue. + * active queue to either the inactive or laundry queue. + * + * When scanning active pages, we make clean pages count more heavily + * towards the page shortage than dirty pages. This is because dirty + * pages must be laundered before they can be reused and thus have less + * utility when attempting to quickly alleviate a shortage. However, + * this weighting also causes the scan to deactivate dirty pages more + * more aggressively, improving the effectiveness of clustering and + * ensuring that they can eventually be reused. */ - inactq_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count + + inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + + vm_cnt.v_laundry_count / act_scan_laundry_weight) + vm_paging_target() + deficit + addl_page_shortage; + page_shortage *= act_scan_laundry_weight; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); @@ -1254,14 +1621,44 @@ m->act_count -= min(m->act_count, ACT_DECLINE); /* - * Move this page to the tail of the active or inactive + * Move this page to the tail of the active, inactive or laundry * queue depending on usage. */ if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); - vm_page_deactivate(m); - inactq_shortage--; + + /* + * When not short for inactive pages, let dirty pages go + * through the inactive queue before moving to the + * laundry queues. This gives them some extra time to + * be reactivated, potentially avoiding an expensive + * pageout. During a page shortage, the inactive queue + * is necessarily small, so we may move dirty pages + * directly to the laundry queue. + */ + if (inactq_shortage <= 0) + vm_page_deactivate(m); + else { + /* + * Calling vm_page_test_dirty() here would + * require acquisition of the object's write + * lock. However, during a page shortage, + * directing dirty pages into the laundry + * queue is only an optimization and not a + * requirement. Therefore, we simply rely on + * the opportunistic updates to the page's + * dirty field by the pmap. + */ + if (m->dirty == 0) { + vm_page_deactivate(m); + inactq_shortage -= + act_scan_laundry_weight; + } else { + vm_page_launder(m); + inactq_shortage--; + } + } } else vm_page_requeue_locked(m); vm_page_unlock(m); @@ -1570,14 +1967,14 @@ * thread during the previous scan, which must have * been a level 0 scan, or vm_pageout_wanted was * already set and the scan failed to free enough - * pages. If we haven't yet performed a level >= 2 - * scan (unlimited dirty cleaning), then upgrade the - * level and scan again now. Otherwise, sleep a bit - * and try again later. + * pages. If we haven't yet performed a level >= 1 + * (page reclamation) scan, then increase the level + * and scan again now. Otherwise, sleep a bit and + * try again later. */ mtx_unlock(&vm_page_queue_free_mtx); - if (pass > 1) - pause("psleep", hz / 2); + if (pass >= 1) + pause("psleep", hz / VM_INACT_SCAN_RATE); pass++; } else { /* @@ -1648,6 +2045,14 @@ /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = vm_cnt.v_free_count / 3; + + /* + * Target amount of memory to move out of the laundry queue during a + * background laundering. This is proportional to the amount of system + * memory. + */ + vm_background_launder_target = (vm_cnt.v_free_target - + vm_cnt.v_free_min) / 10; } /* @@ -1662,6 +2067,10 @@ #endif swap_pager_swap_init(); + error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, + 0, 0, "laundry: dom0"); + if (error != 0) + panic("starting laundry for domain 0, error %d", error); #ifdef VM_NUMA_ALLOC for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,