Index: sys/vm/vm_kern.c
===================================================================
--- sys/vm/vm_kern.c
+++ sys/vm/vm_kern.c
@@ -162,7 +162,7 @@
 	vm_offset_t addr;
 	vm_ooffset_t offset;
 	vm_page_t m;
-	int pflags, tries;
+	int pflags, level;
 	int i;
 
 	size = round_page(size);
@@ -172,16 +172,17 @@
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
-		tries = 0;
+		level = 0;
retry:
 		m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i),
 		    pflags, 1, low, high, PAGE_SIZE, 0, memattr);
 		if (m == NULL) {
 			VM_OBJECT_WUNLOCK(object);
-			if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
-				vm_pageout_grow_cache(tries, low, high);
+			if (level < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+				vm_pageout_reclaim_contig(1, low, high,
+				    PAGE_SIZE, 0, level);
 				VM_OBJECT_WLOCK(object);
-				tries++;
+				level++;
 				goto retry;
 			}
 			/*
@@ -225,7 +226,7 @@
 	vm_offset_t addr, tmp;
 	vm_ooffset_t offset;
 	vm_page_t end_m, m;
-	int pflags, tries;
+	int pflags, level;
 
 	size = round_page(size);
 	if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
@@ -233,16 +234,17 @@
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 	VM_OBJECT_WLOCK(object);
-	tries = 0;
+	level = 0;
retry:
 	m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
 	    atop(size), low, high, alignment, boundary, memattr);
 	if (m == NULL) {
 		VM_OBJECT_WUNLOCK(object);
-		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
-			vm_pageout_grow_cache(tries, low, high);
+		if (level < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+			vm_pageout_reclaim_contig(atop(size), low, high,
+			    alignment, boundary, level);
 			VM_OBJECT_WLOCK(object);
-			tries++;
+			level++;
 			goto retry;
 		}
 		vmem_free(vmem, addr, size);
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -1887,6 +1887,7 @@
 		req_class = VM_ALLOC_SYSTEM;
 
 	SLIST_INIT(&deferred_vdrop_list);
+	m_ret = NULL;
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages +
 	    vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
@@ -1894,37 +1895,29 @@
 	    vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages)) {
 #if VM_NRESERVLEVEL > 0
-retry:
 		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
 		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
 		    low, high, alignment, boundary)) == NULL)
 #endif
 			m_ret = vm_phys_alloc_contig(npages, low, high,
 			    alignment, boundary);
-	} else {
+	}
+	if (m_ret == NULL) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit, npages);
 		pagedaemon_wakeup();
 		return (NULL);
 	}
-	if (m_ret != NULL)
-		for (m = m_ret; m < &m_ret[npages]; m++) {
-			drop = vm_page_alloc_init(m);
-			if (drop != NULL) {
-				/*
-				 * Enqueue the vnode for deferred vdrop().
-				 */
-				m->plinks.s.pv = drop;
-				SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
-				    plinks.s.ss);
-			}
+	for (m = m_ret; m < &m_ret[npages]; m++) {
+		drop = vm_page_alloc_init(m);
+		if (drop != NULL) {
+			/*
+			 * Enqueue the vnode for deferred vdrop().
+			 */
+			m->plinks.s.pv = drop;
+			SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
+			    plinks.s.ss);
 		}
-	else {
-#if VM_NRESERVLEVEL > 0
-		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
-		    boundary))
-			goto retry;
-#endif
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (m_ret == NULL)
Index: sys/vm/vm_pageout.h
===================================================================
--- sys/vm/vm_pageout.h
+++ sys/vm/vm_pageout.h
@@ -102,7 +102,9 @@
 #ifdef _KERNEL
 int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
-void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t);
+int vm_pageout_reclaim_contig(u_long, vm_paddr_t, vm_paddr_t, u_long,
+    vm_paddr_t, int);
+int vm_pageout_count_pages(vm_page_t, u_long, int);
 void vm_pageout_oom(int shortage);
 #endif
 #endif	/* _VM_VM_PAGEOUT_H_ */
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -107,6 +107,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -121,6 +122,8 @@
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
+static boolean_t vm_pageout_candidate(vm_page_t);
+static boolean_t vm_pageout_contig(vm_page_t p_start, u_long npages, int level);
 
 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
     NULL);
@@ -167,7 +170,6 @@
 #endif
 static int vm_max_launder = 32;
 static int vm_pageout_update_period;
-static int defer_swap_pageouts;
 static int disable_swap_pageouts;
 static int lowmem_period = 10;
 static int lowmem_ticks;
@@ -206,9 +208,6 @@
 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
 #endif
 
-SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
-	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
-
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
@@ -224,8 +223,7 @@
 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
 
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
-    vm_paddr_t);
+static int vm_pageout_launder(vm_page_t);
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
@@ -332,6 +330,136 @@
 }
 
 /*
+ * Attempts to acquire all of the necessary locks to launder a page and then
+ * calls down through the clustering layer to PUTPAGES.  Waits only a short
+ * time for a vnode lock.  Returns 0 on success and an errno otherwise.
+ *
+ * Requires the page and object lock on entry, releases both before return.
+ */
+static int
+vm_pageout_launder(vm_page_t m)
+{
+	vm_object_t object;
+	struct vnode *vp;
+	struct mount *mp;
+	int lockmode;
+	int error;
+
+	vm_page_lock_assert(m, MA_OWNED);
+	object = m->object;
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	/*
+	 * We don't bother paging objects that are "dead".
+	 * Those objects are in a "rundown" state.
+	 */
+	if ((object->flags & OBJ_DEAD) || (disable_swap_pageouts &&
+	    (object->type == OBJT_SWAP || object->type == OBJT_DEFAULT))) {
+		vm_page_unlock(m);
+		VM_OBJECT_WUNLOCK(object);
+		return (ENXIO);
+	}
+
+	vp = NULL;
+	mp = NULL;
+	error = 0;
+
+	/*
+	 * The object is already known NOT to be dead.  It
+	 * is possible for the vget() to block the whole
+	 * pageout daemon, but the new low-memory handling
+	 * code should prevent it.
+	 *
+	 * The previous code skipped locked vnodes and, worse,
+	 * reordered pages in the queue.  This results in
+	 * completely non-deterministic operation and, on a
+	 * busy system, can lead to extremely non-optimal
+	 * pageouts.  For example, it can cause clean pages
+	 * to be freed and dirty pages to be moved to the end
+	 * of the queue.  Since dirty pages are also moved to
+	 * the end of the queue once-cleaned, this gives
+	 * way too large a weighting to deferring the freeing
+	 * of dirty pages.
+	 *
+	 * We can't wait forever for the vnode lock, we might
+	 * deadlock due to a vn_read() getting stuck in
+	 * vm_wait while holding this vnode.  We skip the
+	 * vnode if we can't get it in a reasonable amount
+	 * of time.
+	 */
+	if (object->type == OBJT_VNODE) {
+		vm_page_unlock(m);
+		vp = object->handle;
+		if (vp->v_type == VREG &&
+		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+			mp = NULL;
+			error = EDEADLK;
+			goto unlock;
+		}
+		KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp));
+		vm_object_reference_locked(object);
+		VM_OBJECT_WUNLOCK(object);
+		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+		    LK_SHARED : LK_EXCLUSIVE;
+		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
+			vp = NULL;
+			error = EDEADLK;
+			goto unlock_mp;
+		}
+		VM_OBJECT_WLOCK(object);
+		vm_page_lock(m);
+		/*
+		 * The page might have been moved to another
+		 * queue during potential blocking in vget()
+		 * above.  The page might have been freed and
+		 * reused for another vnode.
+		 */
+		if (m->queue != PQ_INACTIVE || m->object != object) {
+			vm_page_unlock(m);
+			error = ENXIO;
+			goto unlock;
+		}
+
+		/*
+		 * The page may have been busied during the
+		 * blocking in vget().  We don't move the
+		 * page back onto the end of the queue so that
+		 * statistics are more correct if we don't.
+		 *
+		 * If the page has become held it might
+		 * be undergoing I/O, so skip it
+		 */
+		if (vm_page_busied(m) || m->hold_count != 0) {
+			vm_page_unlock(m);
+			error = EBUSY;
+			goto unlock;
+		}
+	}
+
+	/*
+	 * If a page is dirty, then it is either being washed
+	 * (but not yet cleaned) or it is still in the
+	 * laundry.  If it is still in the laundry, then we
+	 * start the cleaning operation.
+	 */
+	if (vm_pageout_clean(m) == 0)
+		error = EIO;
+unlock:
+	VM_OBJECT_WUNLOCK(object);
+
+unlock_mp:
+	vm_page_lock_assert(m, MA_NOTOWNED);
+	if (mp != NULL) {
+		if (vp != NULL)
+			vput(vp);
+		vm_object_deallocate(object);
+		vn_finished_write(mp);
+	}
+
+	return (error);
+}
+
+/*
  * vm_pageout_clean:
  *
  * Clean the page and remove it from the laundry.
@@ -571,107 +699,178 @@
 	return (numpagedout);
 }
 
+static __noinline boolean_t
+vm_pageout_candidate(vm_page_t p)
+{
+	/* Can't pageout wired, busy, or held pages */
+	if (p->wire_count || p->hold_count ||
+	    (p->oflags & VPO_UNMANAGED) != 0 ||
+	    vm_page_busied(p))
+		return (FALSE);
+	return (TRUE);
+}
+
+/*
+ * Count the number of pages requiring pageout processing to create
+ * a contiguous free block.
+ *
+ * The interpretation of level is as follows:
+ *	0: only inactive or free pages
+ *	1: include dirty pages
+ *	2: include active pages
+ *
+ * Held, busy, or wired pages cause a return of -1.
+ */
+int
+vm_pageout_count_pages(vm_page_t p_start, u_long npages, int level)
+{
+	vm_page_t p;
+	int i, workpages;
+
+	workpages = 0;
+	for (i = 0; i < npages; i++) {
+		/*
+		 * All of these checks are done locklessly
+		 * because the results can change as soon
+		 * as we return.  This is a best-effort
+		 * interface.
+		 */
+		p = &p_start[i];
+		if (!vm_pageout_candidate(p))
+			return (-1);
+		/* Dirty is not coherent until we check pmap. */
+		if (p->dirty) {
+			if (level < 1)
+				return (-1);
+			workpages++;
+			continue;
+		}
+		if (p->queue == PQ_ACTIVE) {
+			if (level < 2)
+				return (-1);
+			workpages++;
+			continue;
+		}
+		/*
+		 * Only count INACTIVE and reservations against level 0.
+		 */
+		if (level > 0)
+			continue;
+		if (p->queue == PQ_INACTIVE)
+			workpages++;
+		else if (p->order == VM_NFREEORDER && p->queue == PQ_NONE)
+			workpages++;
+	}
+
+	return (workpages);
+}
+
 static boolean_t
-vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
-    vm_paddr_t high)
+vm_pageout_contig(vm_page_t p_start, u_long npages, int level)
 {
-	struct mount *mp;
-	struct vnode *vp;
 	vm_object_t object;
-	vm_paddr_t pa;
-	vm_page_t m, m_tmp, next;
-	int lockmode;
+	vm_page_t p;
+	int i;
 
-	vm_pagequeue_lock(pq);
-	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
-		if ((m->flags & PG_MARKER) != 0)
-			continue;
-		pa = VM_PAGE_TO_PHYS(m);
-		if (pa < low || pa + PAGE_SIZE > high)
+	if (level > 1) {
+		/* Scan and deactivate. */
+		for (i = 0; i < npages; i++) {
+			p = &p_start[i];
+			vm_page_lock(p);
+			if (!vm_pageout_candidate(p)) {
+				vm_page_unlock(p);
+				return (FALSE);
+			}
+			if (p->queue != PQ_ACTIVE) {
+				vm_page_unlock(p);
				continue;
+			}
+			vm_page_deactivate(p);
+			vm_page_unlock(p);
+		}
+	}
+	/* Scan and free, cleaning if allowed. */
+	for (i = 0; i < npages; i++) {
+		p = &p_start[i];
+		object = p->object;
+		if (object == NULL) {
+			vm_page_lock(p);
+			if (p->object != NULL) {
+				vm_page_unlock(p);
+				i--;	/* Retry */
+				continue;
+			}
+			if (!vm_pageout_candidate(p)) {
+				vm_page_unlock(p);
+				return (FALSE);
+			}
+#if VM_NRESERVLEVEL > 0
+			if (p->order == VM_NFREEORDER && p->queue == PQ_NONE) {
+				mtx_lock(&vm_page_queue_free_mtx);
+				vm_page_unlock(p);
+				vm_reserv_reclaim_page(p);
+				mtx_unlock(&vm_page_queue_free_mtx);
+			} else
+#endif
+				vm_page_unlock(p);
 			continue;
-		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
-			vm_page_unlock(m);
+		}
+		VM_OBJECT_WLOCK(object);
+		if (object != p->object) {
+			VM_OBJECT_WUNLOCK(object);
+			i--;	/* Retry this page. */
 			continue;
 		}
-		object = m->object;
-		if ((!VM_OBJECT_TRYWLOCK(object) &&
-		    (!vm_pageout_fallback_object_lock(m, &next) ||
-		    m->hold_count != 0)) || vm_page_busied(m)) {
-			vm_page_unlock(m);
+		vm_page_lock(p);
+		if (!vm_pageout_candidate(p)) {
+			vm_page_unlock(p);
+			VM_OBJECT_WUNLOCK(object);
+			return (FALSE);
+		}
+		if (p->queue != PQ_INACTIVE) {
+			vm_page_unlock(p);
 			VM_OBJECT_WUNLOCK(object);
 			continue;
 		}
-		vm_page_test_dirty(m);
-		if (m->dirty == 0 && object->ref_count != 0)
-			pmap_remove_all(m);
-		if (m->dirty != 0) {
-			vm_page_unlock(m);
-			if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
-				VM_OBJECT_WUNLOCK(object);
-				continue;
-			}
-			if (object->type == OBJT_VNODE) {
-				vm_pagequeue_unlock(pq);
-				vp = object->handle;
-				vm_object_reference_locked(object);
-				VM_OBJECT_WUNLOCK(object);
-				(void)vn_start_write(vp, &mp, V_WAIT);
-				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
-				    LK_SHARED : LK_EXCLUSIVE;
-				vn_lock(vp, lockmode | LK_RETRY);
-				VM_OBJECT_WLOCK(object);
-				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
-				VM_OBJECT_WUNLOCK(object);
-				VOP_UNLOCK(vp, 0);
-				vm_object_deallocate(object);
-				vn_finished_write(mp);
-				return (TRUE);
-			} else if (object->type == OBJT_SWAP ||
-			    object->type == OBJT_DEFAULT) {
-				vm_pagequeue_unlock(pq);
-				m_tmp = m;
-				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
-				    0, NULL, NULL);
+		pmap_remove_all(p);
+		if (p->dirty) {
+			if (level < 1) {
+				vm_page_unlock(p);
 				VM_OBJECT_WUNLOCK(object);
-				return (TRUE);
+				return (FALSE);
 			}
+			if (vm_pageout_launder(p))
+				return (FALSE);
 		} else {
-			/*
-			 * Dequeue here to prevent lock recursion in
-			 * vm_page_cache().
-			 */
-			vm_page_dequeue_locked(m);
-			vm_page_cache(m);
-			vm_page_unlock(m);
+			vm_page_free(p);
+			vm_page_unlock(p);
+			VM_OBJECT_WUNLOCK(object);
 		}
-		VM_OBJECT_WUNLOCK(object);
 	}
-	vm_pagequeue_unlock(pq);
-	return (FALSE);
+	return (TRUE);
 }
-
 /*
- * Increase the number of cached pages.  The specified value, "tries",
- * determines which categories of pages are cached:
+ * Attempt to free a contiguous region of physical memory within the
+ * specified boundaries.
  *
  * 0: All clean, inactive pages within the specified physical address range
- *    are cached.  Will not sleep.
+ *    are freed.  Will not sleep.
  * 1: The vm_lowmem handlers are called.  All inactive pages within
- *    the specified physical address range are cached.  May sleep.
+ *    the specified physical address range are reclaimed.  May sleep.
 * 2: The vm_lowmem handlers are called.  All inactive and active pages
- *    within the specified physical address range are cached.  May sleep.
+ *    within the specified physical address range are freed.  May sleep.
 */
-void
-vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
+int
+vm_pageout_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int level)
 {
-	int actl, actmax, inactl, inactmax, dom, initial_dom;
-	static int start_dom = 0;
+	vm_page_t p, lastp;
 
-	if (tries > 0) {
+	if (level > 0) {
 		/*
 		 * Decrease registered cache sizes.  The vm_lowmem handlers
 		 * may acquire locks and/or sleep, so they can only be invoked
-		 * when "tries" is greater than zero.
+		 * when "level" is greater than zero.
 		 */
 		SDT_PROBE0(vm, , , vm__lowmem_cache);
 		EVENTHANDLER_INVOKE(vm_lowmem, 0);
@@ -682,56 +881,25 @@
 		 */
 		uma_reclaim();
 	}
-
-	/*
-	 * Make the next scan start on the next domain.
-	 */
-	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
-
-	inactl = 0;
-	inactmax = vm_cnt.v_inactive_count;
-	actl = 0;
-	actmax = tries < 2 ? 0 : vm_cnt.v_active_count;
-	dom = initial_dom;
-
-	/*
-	 * Scan domains in round-robin order, first inactive queues,
-	 * then active.  Since domain usually owns large physically
-	 * contiguous chunk of memory, it makes sense to completely
-	 * exhaust one domain before switching to next, while growing
-	 * the pool of contiguous physical pages.
-	 *
-	 * Do not even start launder a domain which cannot contain
-	 * the specified address range, as indicated by segments
-	 * constituting the domain.
-	 */
-again:
-	if (inactl < inactmax) {
-		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
-		    low, high) &&
-		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
-		    tries, low, high)) {
-			inactl++;
-			goto again;
-		}
-		if (++dom == vm_ndomains)
-			dom = 0;
-		if (dom != initial_dom)
-			goto again;
-	}
-	if (actl < actmax) {
-		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
-		    low, high) &&
-		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
-		    tries, low, high)) {
-			actl++;
-			goto again;
-		}
-		if (++dom == vm_ndomains)
-			dom = 0;
-		if (dom != initial_dom)
-			goto again;
+	lastp = NULL;
+	for (;;) {
+		mtx_lock(&vm_page_queue_free_mtx);
+		p = vm_phys_reclaim_contig(npages, low, high, alignment, boundary,
+		    level);
+		mtx_unlock(&vm_page_queue_free_mtx);
+		if (p == NULL)
+			break;
+		if (vm_pageout_contig(p, npages, level))
+			return (0);
+		/*
+		 * Prevent looping if vm_pageout_contig() failed but the
+		 * same group was selected again.
+		 */
+		if (p == lastp)
+			break;
+		lastp = p;
 	}
+	return (ENOENT);
 }
 
 #if !defined(NO_SWAPPING)
@@ -915,7 +1083,6 @@
 	int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
 	int vnodes_skipped = 0;
 	int maxlaunder;
-	int lockmode;
 	boolean_t queues_locked;
 
 	/*
@@ -1148,153 +1315,25 @@
 			 * pressure where there are insufficient clean pages
 			 * on the inactive queue, we may have to go all out.
 			 */
-			int swap_pageouts_ok;
-			struct vnode *vp = NULL;
-			struct mount *mp = NULL;
-
-			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
-				swap_pageouts_ok = 1;
-			} else {
-				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
-				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
-				vm_page_count_min());
-
-			}
-
-			/*
-			 * We don't bother paging objects that are "dead".
-			 * Those objects are in a "rundown" state.
-			 */
-			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
-				vm_pagequeue_lock(pq);
-				vm_page_unlock(m);
-				VM_OBJECT_WUNLOCK(object);
-				queues_locked = TRUE;
-				vm_page_requeue_locked(m);
-				goto relock_queues;
-			}
-
-			/*
-			 * The object is already known NOT to be dead.  It
-			 * is possible for the vget() to block the whole
-			 * pageout daemon, but the new low-memory handling
-			 * code should prevent it.
-			 *
-			 * The previous code skipped locked vnodes and, worse,
-			 * reordered pages in the queue.  This results in
-			 * completely non-deterministic operation and, on a
-			 * busy system, can lead to extremely non-optimal
-			 * pageouts.  For example, it can cause clean pages
-			 * to be freed and dirty pages to be moved to the end
-			 * of the queue.  Since dirty pages are also moved to
-			 * the end of the queue once-cleaned, this gives
-			 * way too large a weighting to deferring the freeing
-			 * of dirty pages.
-			 *
-			 * We can't wait forever for the vnode lock, we might
-			 * deadlock due to a vn_read() getting stuck in
-			 * vm_wait while holding this vnode.  We skip the
-			 * vnode if we can't get it in a reasonable amount
-			 * of time.
-			 */
-			if (object->type == OBJT_VNODE) {
-				vm_page_unlock(m);
-				vp = object->handle;
-				if (vp->v_type == VREG &&
-				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
-					mp = NULL;
-					++pageout_lock_miss;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-				KASSERT(mp != NULL,
-				    ("vp %p with NULL v_mount", vp));
-				vm_object_reference_locked(object);
-				VM_OBJECT_WUNLOCK(object);
-				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
-				    LK_SHARED : LK_EXCLUSIVE;
-				if (vget(vp, lockmode | LK_TIMELOCK,
-				    curthread)) {
-					VM_OBJECT_WLOCK(object);
-					++pageout_lock_miss;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					vp = NULL;
-					goto unlock_and_continue;
-				}
-				VM_OBJECT_WLOCK(object);
-				vm_page_lock(m);
-				vm_pagequeue_lock(pq);
-				queues_locked = TRUE;
-				/*
-				 * The page might have been moved to another
-				 * queue during potential blocking in vget()
-				 * above.  The page might have been freed and
-				 * reused for another vnode.
-				 */
-				if (m->queue != PQ_INACTIVE ||
-				    m->object != object ||
-				    TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) {
-					vm_page_unlock(m);
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-
-				/*
-				 * The page may have been busied during the
-				 * blocking in vget().  We don't move the
-				 * page back onto the end of the queue so that
-				 * statistics are more correct if we don't.
-				 */
-				if (vm_page_busied(m)) {
-					vm_page_unlock(m);
-					addl_page_shortage++;
-					goto unlock_and_continue;
-				}
-
-				/*
-				 * If the page has become held it might
-				 * be undergoing I/O, so skip it
-				 */
-				if (m->hold_count != 0) {
-					vm_page_unlock(m);
-					addl_page_shortage++;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-				vm_pagequeue_unlock(pq);
-				queues_locked = FALSE;
-			}
+			boolean_t object_dirty;
+			int error;
+
+			object_dirty = (object->flags & OBJ_MIGHTBEDIRTY) != 0;
+			error = vm_pageout_launder(m);
 
 			/*
-			 * If a page is dirty, then it is either being washed
-			 * (but not yet cleaned) or it is still in the
-			 * laundry.  If it is still in the laundry, then we
-			 * start the cleaning operation.
-			 *
 			 * decrement page_shortage on success to account for
 			 * the (future) cleaned page.  Otherwise we could wind
 			 * up laundering or cleaning too many pages.
 			 */
-			if (vm_pageout_clean(m) != 0) {
+			if (error == 0) {
 				--page_shortage;
 				--maxlaunder;
-			}
-unlock_and_continue:
-			vm_page_lock_assert(m, MA_NOTOWNED);
-			VM_OBJECT_WUNLOCK(object);
-			if (mp != NULL) {
-				if (queues_locked) {
-					vm_pagequeue_unlock(pq);
-					queues_locked = FALSE;
-				}
-				if (vp != NULL)
-					vput(vp);
-				vm_object_deallocate(object);
-				vn_finished_write(mp);
+			} else if (error == EDEADLK) {
+				++pageout_lock_miss;
+				if (object_dirty)
+					vnodes_skipped++;
+			} else if (error == EBUSY) {
+				addl_page_shortage++;
 			}
 			vm_page_lock_assert(m, MA_NOTOWNED);
 			goto relock_queues;
Index: sys/vm/vm_phys.h
===================================================================
--- sys/vm/vm_phys.h
+++ sys/vm/vm_phys.h
@@ -72,6 +72,8 @@
 void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
 vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary);
+vm_page_t vm_phys_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int level);
 vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order);
 vm_page_t vm_phys_alloc_pages(int pool, int order);
 boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high);
Index: sys/vm/vm_phys.c
===================================================================
--- sys/vm/vm_phys.c
+++ sys/vm/vm_phys.c
@@ -66,6 +66,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 
 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs.");
@@ -1213,6 +1214,85 @@
 	return (m_ret);
 }
+/*
+ * Find a range of contiguous free pages that can be easily reclaimed
+ * with the set of properties matching those defined by
+ * vm_phys_alloc_contig().
+ */
+vm_page_t
+vm_phys_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int level)
+{
+	struct vm_freelist *fl;
+	struct vm_phys_seg *seg;
+	vm_paddr_t pa, size;
+	vm_page_t m_ret, m_min;
+	u_long min_workpages, workpages;
+	int dom, domain, flind, oind, order, pind;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	size = npages << PAGE_SHIFT;
+	KASSERT(size != 0,
+	    ("vm_phys_reclaim_contig: size must not be 0"));
+	KASSERT((alignment & (alignment - 1)) == 0,
+	    ("vm_phys_reclaim_contig: alignment must be a power of 2"));
+	KASSERT((boundary & (boundary - 1)) == 0,
+	    ("vm_phys_reclaim_contig: boundary must be a power of 2"));
+	/* Compute the queue that is the best fit for npages. */
+	for (order = 0; (1 << order) < npages; order++);
+	order--;
+	m_min = NULL;
+	workpages = 0;
+	dom = 0;
+restartdom:
+	domain = vm_rr_selectdomain();
+	for (flind = 0; flind < vm_nfreelists; flind++) {
+	    for (oind = min(order, VM_NFREEORDER-1); oind >= 0; oind--) {
+		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+		    fl = &vm_phys_free_queues[domain][flind][pind][0];
+		    TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+			/*
+			 * A free list may contain physical pages
+			 * from one or more segments.
+			 */
+			seg = &vm_phys_segs[m_ret->segind];
+			if (seg->start > high ||
+			    low >= seg->end)
+				continue;
+
+			/*
+			 * Determine if the blocks are within the given range,
+			 * satisfy the given alignment, and do not cross the
+			 * given boundary.
+			 */
+			pa = VM_PAGE_TO_PHYS(m_ret);
+			if (pa < low ||
+			    pa + size > high ||
+			    pa + size > seg->end ||
+			    (pa & (alignment - 1)) != 0 ||
+			    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
+				continue;
+
+			workpages = vm_pageout_count_pages(&m_ret[1 << oind],
+			    npages - (1 << oind), level);
+			/* Don't scan further if we found an easy match. */
+			if (workpages == 0)
+				return (m_ret);
+			if (workpages != -1 &&
+			    (m_min == NULL || workpages < min_workpages)) {
+				m_min = m_ret;
+				min_workpages = workpages;
+			}
+		    }
+		}
+	    }
+	}
+	if (++dom < vm_ndomains)
+		goto restartdom;
+	return (m_min);
+}
+
 
 #ifdef DDB
 /*
  * Show the number of physical pages in each of the free lists.
Index: sys/vm/vm_reserv.h
===================================================================
--- sys/vm/vm_reserv.h
+++ sys/vm/vm_reserv.h
@@ -55,8 +55,7 @@
 void vm_reserv_init(void);
 int vm_reserv_level_iffullpop(vm_page_t m);
 boolean_t vm_reserv_reactivate_page(vm_page_t m);
-boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
-    vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
+boolean_t vm_reserv_reclaim_page(vm_page_t m);
 boolean_t vm_reserv_reclaim_inactive(void);
 void vm_reserv_rename(vm_page_t m, vm_object_t new_object,
     vm_object_t old_object, vm_pindex_t old_object_offset);
Index: sys/vm/vm_reserv.c
===================================================================
--- sys/vm/vm_reserv.c
+++ sys/vm/vm_reserv.c
@@ -908,104 +908,39 @@
 }
 
 /*
- * Breaks the reservation at the head of the partially-populated reservation
- * queue, releasing its cached and free pages to the physical memory
- * allocator.  Returns TRUE if a reservation is broken and FALSE otherwise.
- *
- * The free page queue lock must be held.
+ * Breaks the reservation holding the page as long as it is partially
+ * populated.
*/ boolean_t -vm_reserv_reclaim_inactive(void) +vm_reserv_reclaim_page(vm_page_t m) { + mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); vm_reserv_t rv; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) { - vm_reserv_reclaim(rv); - return (TRUE); - } - return (FALSE); + rv = vm_reserv_from_page(m); + if (rv->object == NULL || !rv->inpartpopq) + return (FALSE); + vm_reserv_reclaim(rv); + + return (TRUE); } /* - * Searches the partially-populated reservation queue for the least recently - * active reservation with unused pages, i.e., cached or free, that satisfy the - * given request for contiguous physical memory. If a satisfactory reservation - * is found, it is broken. Returns TRUE if a reservation is broken and FALSE - * otherwise. + * Breaks the reservation at the head of the partially-populated reservation + * queue, releasing its cached and free pages to the physical memory + * allocator. Returns TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t -vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, - u_long alignment, vm_paddr_t boundary) +vm_reserv_reclaim_inactive(void) { - vm_paddr_t pa, size; vm_reserv_t rv; - int hi, i, lo, next_free; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - if (npages > VM_LEVEL_0_NPAGES - 1) - return (FALSE); - size = npages << PAGE_SHIFT; - TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) { - pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]); - if (pa + PAGE_SIZE - size < low) { - /* This entire reservation is too low; go to next. */ - continue; - } - pa = VM_PAGE_TO_PHYS(&rv->pages[0]); - if (pa + size > high) { - /* This entire reservation is too high; go to next. */ - continue; - } - if (pa < low) { - /* Start the search for free pages at "low". */ - i = (low - pa) / NBPOPMAP; - hi = (low - pa) % NBPOPMAP; - } else - i = hi = 0; - do { - /* Find the next free page. */ - lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); - while (lo == 0 && ++i < NPOPMAP) - lo = ffsl(~rv->popmap[i]); - if (i == NPOPMAP) - break; - /* Convert from ffsl() to ordinary bit numbering. */ - lo--; - next_free = NBPOPMAP * i + lo; - pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]); - KASSERT(pa >= low, - ("vm_reserv_reclaim_contig: pa is too low")); - if (pa + size > high) { - /* The rest of this reservation is too high. */ - break; - } else if ((pa & (alignment - 1)) != 0 || - ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { - /* Continue with this reservation. */ - hi = lo; - continue; - } - /* Find the next used page. */ - hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1)); - while (hi == 0 && ++i < NPOPMAP) { - if ((NBPOPMAP * i - next_free) * PAGE_SIZE >= - size) { - vm_reserv_reclaim(rv); - return (TRUE); - } - hi = ffsl(rv->popmap[i]); - } - /* Convert from ffsl() to ordinary bit numbering. */ - if (i != NPOPMAP) - hi--; - if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >= - size) { - vm_reserv_reclaim(rv); - return (TRUE); - } - } while (i < NPOPMAP); + if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) { + vm_reserv_reclaim(rv); + return (TRUE); } return (FALSE); }