Index: sys/amd64/include/vmparam.h =================================================================== --- sys/amd64/include/vmparam.h +++ sys/amd64/include/vmparam.h @@ -227,4 +227,10 @@ #define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */ +/* + * Use a fairly large batch size since we expect amd64 systems to have lots of + * memory. + */ +#define VM_BATCHQUEUE_SIZE 31 + #endif /* _MACHINE_VMPARAM_H_ */ Index: sys/kern/subr_witness.c =================================================================== --- sys/kern/subr_witness.c +++ sys/kern/subr_witness.c @@ -601,7 +601,6 @@ * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, - { "vm pagequeue", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, @@ -611,11 +610,11 @@ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, - { "vm pagequeue", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, + { "vm pagequeue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction Index: sys/vm/vm_object.h =================================================================== --- sys/vm/vm_object.h +++ sys/vm/vm_object.h @@ -111,6 +111,7 @@ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ + u_int iosize; /* (c) Natural I/O size in bytes. */ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c +++ sys/vm/vm_object.c @@ -282,6 +282,7 @@ object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; + object->iosize = 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif @@ -720,14 +721,11 @@ vm_object_terminate_pages(vm_object_t object) { vm_page_t p, p_next; - struct mtx *mtx, *mtx1; - struct vm_pagequeue *pq, *pq1; - int dequeued; + struct mtx *mtx; VM_OBJECT_ASSERT_WLOCKED(object); mtx = NULL; - pq = NULL; /* * Free any remaining pageable pages. This also removes them from the @@ -737,60 +735,23 @@ */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); - if ((object->flags & OBJ_UNMANAGED) == 0) { + if ((object->flags & OBJ_UNMANAGED) == 0) /* * vm_page_free_prep() only needs the page * lock for managed pages. 
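+			 * vm_page_change_lock() is expected to swap the
+			 * lock held in *mtx for the one covering "p",
+			 * dropping the old lock only when the two
+			 * differ.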
*/ - mtx1 = vm_page_lockptr(p); - if (mtx1 != mtx) { - if (mtx != NULL) - mtx_unlock(mtx); - if (pq != NULL) { - vm_pagequeue_cnt_add(pq, dequeued); - vm_pagequeue_unlock(pq); - pq = NULL; - } - mtx = mtx1; - mtx_lock(mtx); - } - } + vm_page_change_lock(p, &mtx); p->object = NULL; if (p->wire_count != 0) - goto unlist; + continue; VM_CNT_INC(v_pfree); p->flags &= ~PG_ZERO; - if (p->queue != PQ_NONE) { - KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: " - "page %p is not queued", p)); - pq1 = vm_page_pagequeue(p); - if (pq != pq1) { - if (pq != NULL) { - vm_pagequeue_cnt_add(pq, dequeued); - vm_pagequeue_unlock(pq); - } - pq = pq1; - vm_pagequeue_lock(pq); - dequeued = 0; - } - p->queue = PQ_NONE; - TAILQ_REMOVE(&pq->pq_pl, p, plinks.q); - dequeued--; - } - if (vm_page_free_prep(p, true)) - continue; -unlist: - TAILQ_REMOVE(&object->memq, p, listq); - } - if (pq != NULL) { - vm_pagequeue_cnt_add(pq, dequeued); - vm_pagequeue_unlock(pq); + + vm_page_free(p); } if (mtx != NULL) mtx_unlock(mtx); - vm_page_free_phys_pglist(&object->memq); - /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were @@ -1973,7 +1934,6 @@ { vm_page_t p, next; struct mtx *mtx; - struct pglist pgl; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || @@ -1982,7 +1942,6 @@ if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); - TAILQ_INIT(&pgl); again: p = vm_page_find_least(object, start); mtx = NULL; @@ -2037,12 +1996,10 @@ if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0) pmap_remove_all(p); p->flags &= ~PG_ZERO; - if (vm_page_free_prep(p, false)) - TAILQ_INSERT_TAIL(&pgl, p, listq); + vm_page_free(p); } if (mtx != NULL) mtx_unlock(mtx); - vm_page_free_phys_pglist(&pgl); vm_object_pip_wakeup(object); } Index: sys/vm/vm_page.h =================================================================== --- sys/vm/vm_page.h +++ sys/vm/vm_page.h @@ -94,7 +94,9 @@ * In general, operations on this structure's mutable fields are * synchronized using either one of or a combination of the lock on the * object that the page belongs to (O), the pool lock for the page (P), - * or the lock for either the free or paging queue (Q). If a field is + * the per-domain lock for the free queues (F), or the page's queue + * lock (Q). The queue lock for a page depends on the value of its + * queue field and described in detail below. If a field is * annotated below with two of these locks, then holding either lock is * sufficient for read access, but both locks are required for write * access. An annotation of (C) indicates that the field is immutable. @@ -143,6 +145,28 @@ * causing the thread to block. vm_page_sleep_if_busy() can be used to * sleep until the page's busy state changes, after which the caller * must re-lookup the page and re-evaluate its state. + * + * The queue field is the index of the page queue containing the + * page, or PQ_NONE if the page is not enqueued. The queue lock of a + * page is the page queue lock corresponding to the page queue index, + * or the page lock (P) for the page. To modify the queue field, the + * queue lock for the old value of the field must be held. It is + * invalid for a page's queue field to transition between two distinct + * page queue indices. That is, when updating the queue field, either + * the new value or the old value must be PQ_NONE. 
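+ * For example, moving a page from PQ_INACTIVE to PQ_ACTIVE requires a
+ * dequeue, which sets the field to PQ_NONE, followed by a separate
+ * enqueue into PQ_ACTIVE.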
+ * + * To avoid contention on page queue locks, page queue operations + * (enqueue, dequeue, requeue) are batched using per-CPU queues. + * A deferred operation is requested by inserting an entry into a + * batch queue; the entry is simply a pointer to the page, and the + * request type is encoded in the page's aflags field using the values + * in PGA_QUEUE_STATE_MASK. The type-stability of struct vm_pages is + * crucial to this scheme since the processing of entries in a given + * batch queue may be deferred indefinitely. In particular, a page + * may be freed before its pending batch queue entries have been + * processed. The page lock (P) must be held to schedule a batched + * queue operation, and the page queue lock must be held in order to + * process batch queue entries for the page queue. */ #if PAGE_SIZE == 4096 @@ -174,7 +198,7 @@ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ - vm_paddr_t phys_addr; /* physical address of page */ + vm_paddr_t phys_addr; /* physical address of page (C) */ struct md_page md; /* machine dependent stuff */ u_int wire_count; /* wired down maps refs (P) */ volatile u_int busy_lock; /* busy owners lock */ @@ -182,11 +206,11 @@ uint16_t flags; /* page PG_* flags (P) */ uint8_t aflags; /* access is atomic */ uint8_t oflags; /* page VPO_* flags (O) */ - uint8_t queue; /* page queue index (P,Q) */ + uint8_t queue; /* page queue index (Q) */ int8_t psind; /* pagesizes[] index (O) */ int8_t segind; /* vm_phys segment index (C) */ - uint8_t order; /* index of the buddy queue */ - uint8_t pool; /* vm_phys freepool index (Q) */ + uint8_t order; /* index of the buddy queue (F) */ + uint8_t pool; /* vm_phys freepool index (F) */ u_char act_count; /* page usage count (P) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ @@ -314,10 +338,32 @@ * * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has * at least one executable mapping. It is not consumed by the MI VM layer. + * + * PGA_ENQUEUED is set and cleared when a page is inserted into or removed + * from a page queue, respectively. It determines whether the plinks.q field + * of the page is valid. To set or clear this flag, the queue lock for the + * page must be held: the page queue lock corresponding to the page's "queue" + * field if its value is not PQ_NONE, and the page lock otherwise. + * + * PGA_DEQUEUE is set when the page is scheduled to be dequeued from a page + * queue, and cleared when the dequeue request is processed. A page may + * have PGA_DEQUEUE set and PGA_ENQUEUED cleared, for instance if a dequeue + * is requested after the page is scheduled to be enqueued but before it is + * actually inserted into the page queue. The page lock must be held to set + * this flag, and the queue lock for the page must be held to clear it. + * + * PGA_REQUEUE is set when the page is scheduled to be requeued in its page + * queue. The page lock must be held to set this flag, and the queue lock + * for the page must be held to clear it. 
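+ *
+ * PGA_QUEUE_STATE_MASK, defined below, is the union of these three
+ * flags and is used to test or clear a page's queue state as a single
+ * unit.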
*/ #define PGA_WRITEABLE 0x01 /* page may be mapped writeable */ #define PGA_REFERENCED 0x02 /* page has been referenced */ #define PGA_EXECUTABLE 0x04 /* page may be mapped executable */ +#define PGA_ENQUEUED 0x08 /* page is enqueued in a page queue */ +#define PGA_DEQUEUE 0x10 /* page is due to be dequeued */ +#define PGA_REQUEUE 0x20 /* page is due to be requeued */ + +#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE) /* * Page flags. If changed at any other time than page allocation or @@ -483,10 +529,10 @@ void vm_page_deactivate(vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); +void vm_page_dequeue_lazy(vm_page_t m); void vm_page_dequeue_locked(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); -void vm_page_free_phys_pglist(struct pglist *tq); -bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked); +bool vm_page_free_prep(vm_page_t m); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); Index: sys/vm/vm_page.c =================================================================== --- sys/vm/vm_page.c +++ sys/vm/vm_page.c @@ -131,13 +131,11 @@ extern void uma_startup(void *, int); extern int vmem_startup_count(void); -/* - * Associated with page of user-allocatable memory is a - * page structure. - */ - struct vm_domain vm_dom[MAXMEMDOM]; +static DPCPU_DEFINE(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); +static DPCPU_DEFINE(struct vm_batchqueue, noreuseq[MAXMEMDOM]); + struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; struct mtx_padalign __exclusive_cache_line vm_domainset_lock; @@ -176,7 +174,7 @@ static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); -static void vm_page_enqueue(uint8_t queue, vm_page_t m); +static void vm_page_enqueue_lazy(vm_page_t m, uint8_t queue); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); @@ -186,6 +184,9 @@ vm_page_t m_run, vm_paddr_t high); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); +static int vm_page_import(void *arg, void **store, int cnt, int domain, + int flags); +static void vm_page_release(void *arg, void **store, int cnt); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); @@ -199,6 +200,33 @@ VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } +/* + * The cache page zone is initialized later since we need to be able to allocate + * pages before UMA is fully initialized. + */ +static void +vm_page_init_cache_zones(void *dummy __unused) +{ + struct vm_domain *vmd; + int i; + + for (i = 0; i < vm_ndomains; i++) { + vmd = VM_DOMAIN(i); + /* + * Don't allow the page cache to take up more than .25% of + * memory. + */ + if (vmd->vmd_page_count / 400 < 256 * mp_ncpus) + continue; + vmd->vmd_pgcache = uma_zcache_create("vm pgcache", + sizeof(struct vm_page), NULL, NULL, NULL, NULL, + vm_page_import, vm_page_release, vmd, + /* UMA_ZONE_NOBUCKETCACHE |*/ + UMA_ZONE_MAXBUCKET | UMA_ZONE_VM); + } +} +SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); + /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. 
*/ #if PAGE_SIZE == 32768 #ifdef CTASSERT @@ -1786,6 +1814,7 @@ #if VM_NRESERVLEVEL > 0 found: #endif + vm_page_dequeue(m); vm_page_alloc_check(m); /* @@ -1982,8 +2011,10 @@ #if VM_NRESERVLEVEL > 0 found: #endif - for (m = m_ret; m < &m_ret[npages]; m++) + for (m = m_ret; m < &m_ret[npages]; m++) { + vm_page_dequeue(m); vm_page_alloc_check(m); + } /* * Initialize the pages. Only the PG_ZERO flag is inherited. @@ -2127,6 +2158,7 @@ goto again; return (NULL); } + vm_page_dequeue(m); vm_page_alloc_check(m); /* @@ -2150,6 +2182,51 @@ return (m); } +static int +vm_page_import(void *arg, void **store, int cnt, int domain, int flags) +{ + struct vm_domain *vmd; + vm_page_t m; + int i, j, n; + + vmd = arg; + domain = vmd->vmd_domain; + n = 64; /* Starting stride. */ + vm_domain_free_lock(vmd); + for (i = 0; i < cnt; i+=n) { + n = vm_phys_alloc_npages(domain, VM_FREELIST_DEFAULT, &m, + MIN(n, cnt-i)); + if (n == 0) + break; + if (!vm_domain_allocate(vmd, VM_ALLOC_NORMAL, n)) { + vm_phys_free_contig(m, n); + break; + } + for (j = 0; j < n; j++) + store[i+j] = m++; + } + vm_domain_free_unlock(vmd); + + return (i); +} + +static void +vm_page_release(void *arg, void **store, int cnt) +{ + struct vm_domain *vmd; + vm_page_t m; + int i; + + vmd = arg; + vm_domain_free_lock(vmd); + for (i = 0; i < cnt; i++) { + m = (vm_page_t)store[i]; + vm_phys_free_pages(m, 0); + } + vm_domain_free_unlock(vmd); + vm_domain_freecnt_inc(vmd, cnt); +} + #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ @@ -2274,7 +2351,8 @@ vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && - m->queue != PQ_NONE && !vm_page_busied(m)) { + m->queue != PQ_NONE && + (m->aflags & PGA_DEQUEUE) == 0 && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. 
Extend the current run by one @@ -2425,7 +2503,9 @@ error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; - else if (m->queue != PQ_NONE && !vm_page_busied(m)) { + else if (m->queue != PQ_NONE && + (m->aflags & PGA_DEQUEUE) == 0 && + !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); @@ -2485,7 +2565,8 @@ */ if (object->ref_count != 0) pmap_remove_all(m); - m_new->aflags = m->aflags; + m_new->aflags = m->aflags & + ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); m_new->oflags = m->oflags & VPO_NOSYNC; @@ -2497,7 +2578,7 @@ vm_page_remque(m); vm_page_replace_checked(m_new, object, m->pindex, m); - if (vm_page_free_prep(m, false)) + if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); @@ -2511,7 +2592,7 @@ m->flags &= ~PG_ZERO; vm_page_remque(m); vm_page_remove(m); - if (vm_page_free_prep(m, false)) + if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); KASSERT(m->dirty == 0, @@ -2954,113 +3035,289 @@ return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]); } +static struct mtx * +vm_page_pagequeue_lockptr(vm_page_t m) +{ + + if (m->queue == PQ_NONE) + return (NULL); + return (&vm_page_pagequeue(m)->pq_mutex); +} + +static void +vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, + uint8_t queue) +{ + vm_page_t m; + int delta, i; + uint8_t aflags; + + vm_pagequeue_assert_locked(pq); + + delta = 0; + for (i = 0; i < bq->bq_cnt; i++) { + m = bq->bq_pa[i]; + if (__predict_false(m->queue != queue)) + continue; + + aflags = m->aflags; + if ((aflags & PGA_DEQUEUE) != 0) { + if (__predict_true((aflags & PGA_ENQUEUED) != 0)) { + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); + delta--; + } + + /* + * Synchronize with the page daemon, which may be + * simultaneously scanning this page with only the page + * lock held. We must be careful to avoid leaving the + * page in a state where it appears to belong to a page + * queue. + */ + m->queue = PQ_NONE; + atomic_thread_fence_rel(); + vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK); + } else if ((aflags & PGA_ENQUEUED) == 0) { + TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); + delta++; + vm_page_aflag_set(m, PGA_ENQUEUED); + if ((aflags & PGA_REQUEUE) != 0) + vm_page_aflag_clear(m, PGA_REQUEUE); + } else if ((aflags & PGA_REQUEUE) != 0) { + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); + TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); + vm_page_aflag_clear(m, PGA_REQUEUE); + } + } + vm_batchqueue_init(bq); + vm_pagequeue_cnt_add(pq, delta); +} + /* - * vm_page_dequeue: + * vm_page_dequeue_lazy: [ internal use only ] * - * Remove the given page from its current page queue. + * Request removal of the given page from its current page + * queue. Physical removal from the queue may be deferred + * arbitrarily, and may be cancelled by later queue operations + * on that page. * * The page must be locked. 
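+ *
+ *	The request is recorded by setting PGA_DEQUEUE and placing the
+ *	page in the current CPU's batch queue; the entry is processed
+ *	the next time a batch for this page queue is flushed.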
*/ void -vm_page_dequeue(vm_page_t m) +vm_page_dequeue_lazy(vm_page_t m) { + struct vm_batchqueue *bq; struct vm_pagequeue *pq; + int domain, queue; vm_page_assert_locked(m); - KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", - m)); - pq = vm_page_pagequeue(m); - vm_pagequeue_lock(pq); - m->queue = PQ_NONE; - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - vm_pagequeue_cnt_dec(pq); + + queue = m->queue; + if (queue == PQ_NONE) + return; + domain = vm_phys_domain(m); + pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue]; + + vm_page_aflag_set(m, PGA_DEQUEUE); + + critical_enter(); + bq = DPCPU_PTR(pqbatch[domain][queue]); + if (vm_batchqueue_insert(bq, m)) { + critical_exit(); + return; + } + if (!vm_pagequeue_trylock(pq)) { + critical_exit(); + vm_pagequeue_lock(pq); + critical_enter(); + bq = DPCPU_PTR(pqbatch[domain][queue]); + } + vm_pqbatch_process(pq, bq, queue); + + /* + * The page may have been dequeued by another thread before we + * acquired the page queue lock. However, since we hold the + * page lock, the page's queue field cannot change a second + * time and we can safely clear PGA_DEQUEUE. + */ + KASSERT(m->queue == queue || m->queue == PQ_NONE, + ("%s: page %p migrated between queues", __func__, m)); + if (m->queue == queue) { + (void)vm_batchqueue_insert(bq, m); + vm_pqbatch_process(pq, bq, queue); + } else + vm_page_aflag_clear(m, PGA_DEQUEUE); vm_pagequeue_unlock(pq); + critical_exit(); } /* * vm_page_dequeue_locked: * - * Remove the given page from its current page queue. + * Remove the page from its page queue, which must be locked. + * If the page lock is not held, there is no guarantee that the + * page will not be enqueued by another thread before this function + * returns. In this case, it is up to the caller to ensure that + * no other threads hold a reference to the page. * - * The page and page queue must be locked. + * The page queue lock must be held. If the page is not already + * logically dequeued, the page lock must be held as well. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; - vm_page_lock_assert(m, MA_OWNED); - pq = vm_page_pagequeue(m); - vm_pagequeue_assert_locked(pq); + KASSERT(m->queue != PQ_NONE, + ("%s: page %p queue field is PQ_NONE", __func__, m)); + vm_pagequeue_assert_locked(vm_page_pagequeue(m)); + KASSERT((m->aflags & PGA_DEQUEUE) != 0 || + mtx_owned(vm_page_lockptr(m)), + ("%s: queued unlocked page %p", __func__, m)); + + if ((m->aflags & PGA_ENQUEUED) != 0) { + pq = vm_page_pagequeue(m); + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); + vm_pagequeue_cnt_dec(pq); + } + + /* + * Synchronize with the page daemon, which may be simultaneously + * scanning this page with only the page lock held. We must be careful + * to avoid leaving the page in a state where it appears to belong to a + * page queue. + */ m->queue = PQ_NONE; - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - vm_pagequeue_cnt_dec(pq); + atomic_thread_fence_rel(); + vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK); +} + +/* + * vm_page_dequeue: + * + * Remove the page from whichever page queue it's in, if any. + * If the page lock is not held, there is no guarantee that the + * page will not be enqueued by another thread before this function + * returns. In this case, it is up to the caller to ensure that + * no other threads hold a reference to the page. 
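+ *
+ *	Unlike vm_page_dequeue_lazy(), the dequeue is performed
+ *	immediately: the page queue lock is acquired, and the page's
+ *	queue field is re-checked after each acquisition in case the
+ *	page was concurrently dequeued.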
+ */ +void +vm_page_dequeue(vm_page_t m) +{ + struct mtx *lock, *lock1; + + lock = vm_page_pagequeue_lockptr(m); + for (;;) { + if (lock == NULL) + return; + mtx_lock(lock); + if ((lock1 = vm_page_pagequeue_lockptr(m)) == lock) + break; + mtx_unlock(lock); + lock = lock1; + } + KASSERT(lock == vm_page_pagequeue_lockptr(m), + ("%s: page %p migrated directly between queues", __func__, m)); + vm_page_dequeue_locked(m); + mtx_unlock(lock); } /* - * vm_page_enqueue: + * vm_page_enqueue_lazy: * - * Add the given page to the specified page queue. + * Schedule the given page for insertion into the specified page queue. + * Physical insertion of the page may be deferred indefinitely. * * The page must be locked. */ static void -vm_page_enqueue(uint8_t queue, vm_page_t m) +vm_page_enqueue_lazy(vm_page_t m, uint8_t queue) { + struct vm_batchqueue *bq; struct vm_pagequeue *pq; + int domain; - vm_page_lock_assert(m, MA_OWNED); - KASSERT(queue < PQ_COUNT, - ("vm_page_enqueue: invalid queue %u request for page %p", - queue, m)); + vm_page_assert_locked(m); + KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, + ("%s: page %p is already enqueued", __func__, m)); + + domain = vm_phys_domain(m); pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue]; - vm_pagequeue_lock(pq); + + /* + * The queue field might be changed back to PQ_NONE by a concurrent + * call to vm_page_dequeue(). In that case the batch queue entry will + * be a no-op. + */ m->queue = queue; - TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - vm_pagequeue_cnt_inc(pq); + + critical_enter(); + bq = DPCPU_PTR(pqbatch[domain][queue]); + if (__predict_true(vm_batchqueue_insert(bq, m))) { + critical_exit(); + return; + } + if (!vm_pagequeue_trylock(pq)) { + critical_exit(); + vm_pagequeue_lock(pq); + critical_enter(); + bq = DPCPU_PTR(pqbatch[domain][queue]); + } + vm_pqbatch_process(pq, bq, queue); + (void)vm_batchqueue_insert(bq, m); + vm_pqbatch_process(pq, bq, queue); vm_pagequeue_unlock(pq); + critical_exit(); } /* * vm_page_requeue: * - * Move the given page to the tail of its current page queue. + * Schedule a requeue of the given page. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { + struct vm_batchqueue *bq; struct vm_pagequeue *pq; + int domain, queue; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, - ("vm_page_requeue: page %p is not queued", m)); + ("%s: page %p is not enqueued", __func__, m)); + + domain = vm_phys_domain(m); + queue = m->queue; pq = vm_page_pagequeue(m); - vm_pagequeue_lock(pq); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - vm_pagequeue_unlock(pq); -} -/* - * vm_page_requeue_locked: - * - * Move the given page to the tail of its current page queue. - * - * The page queue must be locked. 
- */ -void -vm_page_requeue_locked(vm_page_t m) -{ - struct vm_pagequeue *pq; + if (queue == PQ_NONE) + return; - KASSERT(m->queue != PQ_NONE, - ("vm_page_requeue_locked: page %p is not queued", m)); - pq = vm_page_pagequeue(m); - vm_pagequeue_assert_locked(pq); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); + vm_page_aflag_set(m, PGA_REQUEUE); + critical_enter(); + bq = DPCPU_PTR(pqbatch[domain][queue]); + if (__predict_true(vm_batchqueue_insert(bq, m))) { + critical_exit(); + return; + } + if (!vm_pagequeue_trylock(pq)) { + critical_exit(); + vm_pagequeue_lock(pq); + critical_enter(); + bq = DPCPU_PTR(pqbatch[domain][queue]); + } + vm_pqbatch_process(pq, bq, queue); + KASSERT(m->queue == queue || m->queue == PQ_NONE, + ("%s: page %p migrated between queues", __func__, m)); + if (m->queue == queue) { + (void)vm_batchqueue_insert(bq, m); + vm_pqbatch_process(pq, bq, queue); + } else + vm_page_aflag_clear(m, PGA_REQUEUE); + vm_pagequeue_unlock(pq); + critical_exit(); } /* @@ -3078,18 +3335,18 @@ int queue; vm_page_lock_assert(m, MA_OWNED); - if ((queue = m->queue) != PQ_ACTIVE) { - if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { - if (m->act_count < ACT_INIT) - m->act_count = ACT_INIT; - if (queue != PQ_NONE) - vm_page_dequeue(m); - vm_page_enqueue(PQ_ACTIVE, m); - } - } else { - if (m->act_count < ACT_INIT) + + if ((queue = m->queue) == PQ_ACTIVE || m->wire_count > 0 || + (m->oflags & VPO_UNMANAGED) != 0) { + if (queue == PQ_ACTIVE && m->act_count < ACT_INIT) m->act_count = ACT_INIT; + return; } + + vm_page_remque(m); + if (m->act_count < ACT_INIT) + m->act_count = ACT_INIT; + vm_page_enqueue_lazy(m, PQ_ACTIVE); } /* @@ -3100,11 +3357,10 @@ * the page to the free list only if this function returns true. * * The object must be locked. The page must be locked if it is - * managed. For a queued managed page, the pagequeue_locked - * argument specifies whether the page queue is already locked. + * managed. */ bool -vm_page_free_prep(vm_page_t m, bool pagequeue_locked) +vm_page_free_prep(vm_page_t m) { #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) @@ -3120,14 +3376,14 @@ if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), - ("vm_page_free_toq: freeing mapped page %p", m)); + ("vm_page_free_prep: freeing mapped page %p", m)); } else KASSERT(m->queue == PQ_NONE, - ("vm_page_free_toq: unmanaged page %p is queued", m)); + ("vm_page_free_prep: unmanaged page %p is queued", m)); VM_CNT_INC(v_tfree); if (vm_page_sbusied(m)) - panic("vm_page_free: freeing busy page %p", m); + panic("vm_page_free_prep: freeing busy page %p", m); vm_page_remove(m); @@ -3143,21 +3399,23 @@ return (false); } - if (m->queue != PQ_NONE) { - if (pagequeue_locked) - vm_page_dequeue_locked(m); - else - vm_page_dequeue(m); - } + /* + * Pages need not be dequeued before they are returned to the physical + * memory allocator, but they must at least be marked for a deferred + * dequeue. 
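+	 * Because struct vm_page is type-stable, the deferred dequeue
+	 * may be processed safely even after the page has been freed.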
+ */ + if ((m->oflags & VPO_UNMANAGED) == 0) + vm_page_dequeue_lazy(m); + m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) - panic("vm_page_free: freeing wired page %p", m); + panic("vm_page_free_prep: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, - ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); + ("vm_page_free_prep: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; return (false); } @@ -3176,36 +3434,6 @@ return (true); } -void -vm_page_free_phys_pglist(struct pglist *tq) -{ - struct vm_domain *vmd; - vm_page_t m; - int cnt; - - if (TAILQ_EMPTY(tq)) - return; - vmd = NULL; - cnt = 0; - TAILQ_FOREACH(m, tq, listq) { - if (vmd != vm_pagequeue_domain(m)) { - if (vmd != NULL) { - vm_domain_free_unlock(vmd); - vm_domain_freecnt_inc(vmd, cnt); - cnt = 0; - } - vmd = vm_pagequeue_domain(m); - vm_domain_free_lock(vmd); - } - vm_phys_free_pages(m, 0); - cnt++; - } - if (vmd != NULL) { - vm_domain_free_unlock(vmd); - vm_domain_freecnt_inc(vmd, cnt); - } -} - /* * vm_page_free_toq: * @@ -3220,8 +3448,9 @@ { struct vm_domain *vmd; - if (!vm_page_free_prep(m, false)) + if (!vm_page_free_prep(m)) return; + vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); @@ -3243,23 +3472,18 @@ vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) { vm_page_t m; - struct pglist pgl; int count; if (SLIST_EMPTY(free)) return; count = 0; - TAILQ_INIT(&pgl); while ((m = SLIST_FIRST(free)) != NULL) { count++; SLIST_REMOVE_HEAD(free, plinks.s.ss); - if (vm_page_free_prep(m, false)) - TAILQ_INSERT_TAIL(&pgl, m, listq); + vm_page_free_toq(m); } - vm_page_free_phys_pglist(&pgl); - if (update_wire_count) vm_wire_sub(count); } @@ -3318,22 +3542,25 @@ KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); + if ((m->oflags & VPO_UNMANAGED) == 0) + vm_page_assert_locked(m); unwired = vm_page_unwire_noq(m); - if (unwired && (m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL) { - if (m->queue == queue) { + if (!unwired || (m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL) + return (unwired); + + if (m->queue == queue) { + if (queue == PQ_ACTIVE) + vm_page_reference(m); + else if (queue != PQ_NONE) + vm_page_requeue(m); + } else { + vm_page_dequeue(m); + if (queue != PQ_NONE) { + vm_page_enqueue_lazy(m, queue); if (queue == PQ_ACTIVE) - vm_page_reference(m); - else if (queue != PQ_NONE) - vm_page_requeue(m); - } else { - vm_page_remque(m); - if (queue != PQ_NONE) { - vm_page_enqueue(queue, m); - if (queue == PQ_ACTIVE) - /* Initialize act_count. */ - vm_page_activate(m); - } + /* Initialize act_count. */ + vm_page_activate(m); } } return (unwired); @@ -3369,73 +3596,85 @@ } /* - * Move the specified page to the inactive queue, or requeue the page if it is - * already in the inactive queue. - * - * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive - * queue. However, setting "noreuse" to TRUE will accelerate the specified - * page's reclamation, but it will not unmap the page from any address space. - * This is implemented by inserting the page near the head of the inactive - * queue, using a marker page to guide FIFO insertion ordering. + * Move the specified page to the tail of the inactive queue, or requeue + * the page if it is already in the inactive queue. * * The page must be locked. 
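+ *
+ * Both the enqueue and the requeue are deferred through the per-CPU
+ * batch queues.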
*/ -static inline void -_vm_page_deactivate(vm_page_t m, boolean_t noreuse) +void +vm_page_deactivate(vm_page_t m) { - struct vm_pagequeue *pq; - int queue; vm_page_assert_locked(m); - if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { - pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE]; - /* Avoid multiple acquisitions of the inactive queue lock. */ - queue = m->queue; - if (queue == PQ_INACTIVE) { - vm_pagequeue_lock(pq); - vm_page_dequeue_locked(m); - } else { - if (queue != PQ_NONE) - vm_page_dequeue(m); - vm_pagequeue_lock(pq); - } - m->queue = PQ_INACTIVE; - if (noreuse) - TAILQ_INSERT_BEFORE( - &vm_pagequeue_domain(m)->vmd_inacthead, m, - plinks.q); - else - TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - vm_pagequeue_cnt_inc(pq); - vm_pagequeue_unlock(pq); - } -} - -/* - * Move the specified page to the inactive queue, or requeue the page if it is - * already in the inactive queue. - * - * The page must be locked. - */ -void -vm_page_deactivate(vm_page_t m) -{ + if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0) + return; - _vm_page_deactivate(m, FALSE); + if (!vm_page_inactive(m)) { + vm_page_remque(m); + vm_page_enqueue_lazy(m, PQ_INACTIVE); + } else + vm_page_requeue(m); } /* - * Move the specified page to the inactive queue with the expectation - * that it is unlikely to be reused. + * Move the specified page close to the head of the inactive queue, + * bypassing LRU. A marker page is used to maintain FIFO ordering. + * As with regular enqueues, we use a per-CPU batch queue to reduce + * contention on the page queue lock. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { + struct vm_batchqueue *bq; + struct vm_domain *vmd; + struct vm_pagequeue *pq; + vm_page_t marker; + int domain; + + vm_page_assert_locked(m); + + if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0) + return; + + domain = vm_phys_domain(m); + vmd = VM_DOMAIN(domain); + pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; + + if (!vm_page_inactive(m)) + vm_page_remque(m); - _vm_page_deactivate(m, TRUE); + m->queue = PQ_INACTIVE; + + critical_enter(); + bq = DPCPU_PTR(noreuseq[domain]); + if (vm_batchqueue_insert(bq, m)) { + critical_exit(); + return; + } + if (!vm_pagequeue_trylock(pq)) { + critical_exit(); + vm_pagequeue_lock(pq); + critical_enter(); + bq = DPCPU_PTR(noreuseq[domain]); + } + marker = &vmd->vmd_inacthead; + do { + if (m->queue != PQ_INACTIVE) + continue; + if ((m->aflags & PGA_ENQUEUED) != 0) + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); + else { + vm_page_aflag_set(m, PGA_ENQUEUED); + vm_pagequeue_cnt_inc(pq); + } + TAILQ_INSERT_BEFORE(marker, m, plinks.q); + } while ((m = vm_batchqueue_pop(bq)) != NULL); + vm_pagequeue_unlock(pq); + vm_batchqueue_init(bq); + critical_exit(); } /* @@ -3448,13 +3687,14 @@ { vm_page_assert_locked(m); - if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { - if (m->queue == PQ_LAUNDRY) - vm_page_requeue(m); - else { - vm_page_remque(m); - vm_page_enqueue(PQ_LAUNDRY, m); - } + if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0) + return; + + if (m->queue == PQ_LAUNDRY) + vm_page_requeue(m); + else { + vm_page_remque(m); + vm_page_enqueue_lazy(m, PQ_LAUNDRY); } } @@ -3470,9 +3710,9 @@ vm_page_assert_locked(m); KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); - if (m->queue != PQ_NONE) - vm_page_dequeue(m); - vm_page_enqueue(PQ_UNSWAPPABLE, m); + + vm_page_remque(m); + vm_page_enqueue_lazy(m, PQ_UNSWAPPABLE); } /* Index: sys/vm/vm_pageout.c 
=================================================================== --- sys/vm/vm_pageout.c +++ sys/vm/vm_pageout.c @@ -201,11 +201,17 @@ CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static u_int isqrt(u_int num); -static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall); static void vm_pageout_laundry_worker(void *arg); -static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); + +struct scan_state { + struct vm_batchqueue bq; + struct vm_pagequeue *pq; + vm_page_t marker; + int maxscan; + int scanned; +}; /* * Initialize a dummy page for marking the caller's place in the specified @@ -225,96 +231,157 @@ } /* - * vm_pageout_fallback_object_lock: - * - * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is - * known to have failed and page queue must be either PQ_ACTIVE or - * PQ_INACTIVE. To avoid lock order violation, unlock the page queue - * while locking the vm object. Use marker page to detect page queue - * changes and maintain notion of next page on page queue. Return - * TRUE if no changes were detected, FALSE otherwise. vm object is - * locked on return. - * - * This function depends on both the lock portion of struct vm_object - * and normal struct vm_page being type stable. + * Initialize and enqueue static queue markers. */ -static boolean_t -vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) +static void +vm_pageout_insert_markers(struct vm_domain *vmd) { - struct vm_page marker; - struct vm_pagequeue *pq; - boolean_t unchanged; - u_short queue; - vm_object_t object; + vm_page_t marker; + int i; - queue = m->queue; - vm_pageout_init_marker(&marker, queue); - pq = vm_page_pagequeue(m); - object = m->object; - - TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); - vm_pagequeue_unlock(pq); - vm_page_unlock(m); - VM_OBJECT_WLOCK(object); - vm_page_lock(m); - vm_pagequeue_lock(pq); + /* + * inacthead is used to provide FIFO ordering for LRU-bypassing + * insertions. + */ + marker = &vmd->vmd_inacthead; + vm_pageout_init_marker(marker, PQ_INACTIVE); + TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, marker, + plinks.q); + vm_page_aflag_set(marker, PGA_ENQUEUED); /* - * The page's object might have changed, and/or the page might - * have moved from its original position in the queue. If the - * page's object has changed, then the caller should abandon - * processing the page because the wrong object lock was - * acquired. Use the marker's plinks.q, not the page's, to - * determine if the page has been moved. The state of the - * page's plinks.q can be indeterminate; whereas, the marker's - * plinks.q must be valid. + * The clock pages are used to implement active queue scanning without + * requeues. Scans start at clock[0], which is advanced after the scan + * ends. When the two clock hands meet, they are reset and scanning + * resumes from the head of the queue. 
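+	 * clock[0] records the position at which the next scan will
+	 * resume, while clock[1] marks the end of the current lap
+	 * around the queue.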
*/ - *next = TAILQ_NEXT(&marker, plinks.q); - unchanged = m->object == object && - m == TAILQ_PREV(&marker, pglist, plinks.q); - KASSERT(!unchanged || m->queue == queue, - ("page %p queue %d %d", m, queue, m->queue)); - TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); - return (unchanged); + marker = &vmd->vmd_clock[0]; + vm_pageout_init_marker(marker, PQ_ACTIVE); + TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, marker, + plinks.q); + vm_page_aflag_set(marker, PGA_ENQUEUED); + marker = &vmd->vmd_clock[1]; + vm_pageout_init_marker(marker, PQ_ACTIVE); + TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, marker, + plinks.q); + vm_page_aflag_set(marker, PGA_ENQUEUED); + + for (i = 0; i < PQ_COUNT; i++) + vm_pageout_init_marker(&vmd->vmd_markers[i], i); +} + +static void +vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq, + vm_page_t marker, int maxscan) +{ + + vm_pagequeue_assert_locked(pq); + + if ((marker->aflags & PGA_ENQUEUED) == 0) { + TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q); + vm_page_aflag_set(marker, PGA_ENQUEUED); + } + vm_batchqueue_init(&ss->bq); + ss->pq = pq; + ss->marker = marker; + ss->maxscan = maxscan; + ss->scanned = 0; + vm_pagequeue_unlock(pq); +} + +static void +vm_pageout_end_scan(struct scan_state *ss) +{ + struct vm_pagequeue *pq; + + pq = ss->pq; + vm_pagequeue_assert_locked(pq); + KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0, + ("marker %p not enqueued", ss->marker)); + + if ((ss->marker->aflags & PGA_ENQUEUED) != 0) { + TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q); + vm_page_aflag_clear(ss->marker, PGA_ENQUEUED); + } + VM_CNT_ADD(v_pdpages, ss->scanned); +} + +static inline bool +vm_pageout_page_queued(vm_page_t m, int queue) +{ + + vm_page_assert_locked(m); + + if ((m->aflags & PGA_DEQUEUE) != 0) + return (false); + atomic_thread_fence_acq(); + return (m->queue == queue); } /* - * Lock the page while holding the page queue lock. Use marker page - * to detect page queue changes and maintain notion of next page on - * page queue. Return TRUE if no changes were detected, FALSE - * otherwise. The page is locked on return. The page queue lock might - * be dropped and reacquired. + * Add a small number of queued pages to a batch queue for later processing + * without the corresponding queue lock held. The caller must have enqueued a + * marker page at the desired start point for the scan. * - * This function depends on normal struct vm_page being type stable. + * When processing the batch queue, vm_pageout_page_queued() must be used to + * determine whether the page was logically dequeued by another thread. Once + * this check is performed, the page lock guarantees that the page will not be + * disassociated from the queue. 
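+ *
+ * When "dequeue" is true, the collected pages are also physically removed
+ * from the page queue as they are gathered; the inactive queue scan relies
+ * on this.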
*/ -static boolean_t -vm_pageout_page_lock(vm_page_t m, vm_page_t *next) +static inline void +vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) { - struct vm_page marker; struct vm_pagequeue *pq; - boolean_t unchanged; - u_short queue; + vm_page_t m, marker; - vm_page_lock_assert(m, MA_NOTOWNED); - if (vm_page_trylock(m)) - return (TRUE); + KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0, + ("marker %p not enqueued", ss->marker)); - queue = m->queue; - vm_pageout_init_marker(&marker, queue); - pq = vm_page_pagequeue(m); + marker = ss->marker; + pq = ss->pq; - TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); - vm_pagequeue_unlock(pq); - vm_page_lock(m); vm_pagequeue_lock(pq); + for (m = TAILQ_NEXT(marker, plinks.q); m != NULL && + ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE; + m = TAILQ_NEXT(m, plinks.q), ss->scanned++) { + if ((m->flags & PG_MARKER) == 0) { + KASSERT((m->aflags & PGA_ENQUEUED) != 0, + ("page %p not enqueued", m)); + KASSERT((m->flags & PG_FICTITIOUS) == 0, + ("Fictitious page %p cannot be in page queue", m)); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("Unmanaged page %p cannot be in page queue", m)); + } else if (dequeue) + continue; + + (void)vm_batchqueue_insert(&ss->bq, m); + if (dequeue) { + TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); + vm_page_aflag_clear(m, PGA_ENQUEUED); + } + } + TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q); + if (__predict_true(m != NULL)) + TAILQ_INSERT_BEFORE(m, marker, plinks.q); + else + TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q); + if (dequeue) + vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt); + vm_pagequeue_unlock(pq); +} + +/* XXX */ +static inline vm_page_t +vm_pageout_next(struct scan_state *ss, const bool dequeue) +{ + vm_page_t m; - /* Page queue might have changed. */ - *next = TAILQ_NEXT(&marker, plinks.q); - unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); - KASSERT(!unchanged || m->queue == queue, - ("page %p queue %d %d", m, queue, m->queue)); - TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); - return (unchanged); + m = vm_batchqueue_pop(&ss->bq); + if (m == NULL) { + vm_pageout_collect_batch(ss, dequeue); + m = vm_batchqueue_pop(&ss->bq); + } + return (m); } /* @@ -370,12 +437,12 @@ break; } vm_page_test_dirty(p); - if (p->dirty == 0) { + if (p->dirty == 0 || !vm_page_in_laundry(p)) { ib = 0; break; } vm_page_lock(p); - if (!vm_page_in_laundry(p) || vm_page_held(p)) { + if (vm_page_held(p)) { vm_page_unlock(p); ib = 0; break; @@ -398,10 +465,10 @@ if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_test_dirty(p); - if (p->dirty == 0) + if (p->dirty == 0 || !vm_page_in_laundry(p)) break; vm_page_lock(p); - if (!vm_page_in_laundry(p) || vm_page_held(p)) { + if (vm_page_held(p)) { vm_page_unlock(p); break; } @@ -692,13 +759,18 @@ static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { + struct scan_state ss; struct vm_pagequeue *pq; + struct mtx *mtx; vm_object_t object; - vm_page_t m, next; - int act_delta, error, maxscan, numpagedout, starting_target; + vm_page_t m, marker; + int act_delta, error, numpagedout, queue, starting_target; int vnodes_skipped; - bool pageout_ok, queue_locked; + bool obj_locked, pageout_ok; + mtx = NULL; + obj_locked = false; + object = NULL; starting_target = launder; vnodes_skipped = 0; @@ -711,61 +783,78 @@ * maxscan ensures that we don't re-examine requeued pages. Any * additional pages written as part of a cluster are subtracted from * maxscan since they must be taken from the laundry queue. 
+ * XXX * * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no * swap devices are configured. */ if (atomic_load_acq_int(&swapdev_enabled)) - pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]; + queue = PQ_UNSWAPPABLE; else - pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; + queue = PQ_LAUNDRY; + marker = &vmd->vmd_markers[queue]; + pq = &vmd->vmd_pagequeues[queue]; scan: vm_pagequeue_lock(pq); - maxscan = pq->pq_cnt; - queue_locked = true; - for (m = TAILQ_FIRST(&pq->pq_pl); - m != NULL && maxscan-- > 0 && launder > 0; - m = next) { - vm_pagequeue_assert_locked(pq); - KASSERT(queue_locked, ("unlocked laundry queue")); - KASSERT(vm_page_in_laundry(m), - ("page %p has an inconsistent queue", m)); - next = TAILQ_NEXT(m, plinks.q); - if ((m->flags & PG_MARKER) != 0) + vm_pageout_init_scan(&ss, pq, marker, pq->pq_cnt); + while ((m = vm_pageout_next(&ss, false)) != NULL) { + if (__predict_false((m->flags & PG_MARKER) != 0)) continue; - KASSERT((m->flags & PG_FICTITIOUS) == 0, - ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); - if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { - vm_page_unlock(m); + + vm_page_change_lock(m, &mtx); + +recheck: + /* + * The page may have been disassociated from the queue + * while locks were dropped. + */ + if (!vm_pageout_page_queued(m, queue)) + continue; + + /* + * A requeue was requested, so this page gets a second + * chance. + */ + if ((m->aflags & PGA_REQUEUE) != 0) { + vm_page_requeue(m); continue; } + + /* + * Held pages are essentially stuck in the queue. + * + * Wired pages may not be freed. Complete their removal + * from the queue now to avoid needless revisits during + * future scans. + */ + if (m->hold_count != 0) + continue; if (m->wire_count != 0) { - vm_page_dequeue_locked(m); - vm_page_unlock(m); + vm_page_dequeue_lazy(m); continue; } - object = m->object; - if ((!VM_OBJECT_TRYWLOCK(object) && - (!vm_pageout_fallback_object_lock(m, &next) || - vm_page_held(m))) || vm_page_busied(m)) { - VM_OBJECT_WUNLOCK(object); - if (m->wire_count != 0 && vm_page_pagequeue(m) == pq) - vm_page_dequeue_locked(m); - vm_page_unlock(m); - continue; + + if (object != m->object) { + if (obj_locked) { + VM_OBJECT_WUNLOCK(object); + obj_locked = false; + } + object = m->object; + } + if (!obj_locked) { + if (!VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(mtx); + VM_OBJECT_WLOCK(object); + obj_locked = true; + mtx_lock(mtx); + goto recheck; + } else + obj_locked = true; } - /* - * Unlock the laundry queue, invalidating the 'next' pointer. - * Use a marker to remember our place in the laundry queue. - */ - TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, - plinks.q); - vm_pagequeue_unlock(pq); - queue_locked = false; + if (vm_page_busied(m)) + continue; /* * Invalid pages can be easily freed. 
They cannot be @@ -814,9 +903,11 @@ */ if (!in_shortfall) launder--; - goto drop_page; - } else if ((object->flags & OBJ_DEAD) == 0) - goto requeue_page; + continue; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_page_requeue(m); + continue; + } } /* @@ -851,11 +942,8 @@ else pageout_ok = true; if (!pageout_ok) { -requeue_page: - vm_pagequeue_lock(pq); - queue_locked = true; - vm_page_requeue_locked(m); - goto drop_page; + vm_page_requeue(m); + continue; } /* @@ -874,28 +962,31 @@ error = vm_pageout_clean(m, &numpagedout); if (error == 0) { launder -= numpagedout; - maxscan -= numpagedout - 1; + ss.scanned += numpagedout; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } - goto relock_queue; + mtx = NULL; + obj_locked = false; } -drop_page: - vm_page_unlock(m); + } + if (mtx != NULL) { + mtx_unlock(mtx); + mtx = NULL; + } + if (obj_locked) { VM_OBJECT_WUNLOCK(object); -relock_queue: - if (!queue_locked) { - vm_pagequeue_lock(pq); - queue_locked = true; - } - next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); - TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); + obj_locked = false; } + vm_pagequeue_lock(pq); + vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) { - pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; + queue = PQ_LAUNDRY; + marker = &vmd->vmd_markers[queue]; + pq = &vmd->vmd_pagequeues[queue]; goto scan; } @@ -951,7 +1042,6 @@ vmd = VM_DOMAIN(domain); pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; KASSERT(vmd->vmd_segs != 0, ("domain without segments")); - vm_pageout_init_marker(&vmd->vmd_laundry_marker, PQ_LAUNDRY); shortfall = 0; in_shortfall = false; @@ -1091,6 +1181,68 @@ } } +static int +vm_pageout_free_pages(vm_object_t object, vm_page_t m, struct mtx **mtxp) +{ + vm_page_t p, pp; + vm_pindex_t start; + int pcount, count; + + pcount = MAX(object->iosize / PAGE_SIZE, 1); + count = 1; + if (pcount == 1) { + vm_page_free(m); + goto out; + } + + /* Find the first page in the block. */ + start = m->pindex - (m->pindex % pcount); + for (p = m; p->pindex > start && (pp = vm_page_prev(p)) != NULL; + p = pp); + + /* Free the original page so we don't validate it twice. */ + if (p == m) + p = vm_page_next(m); + vm_page_free(m); + /* Iterate through the block range and free compatible pages. */ + for (m = p; m != NULL; m = p) { + /* Don't cache miss for the next page after the tail. */ + if (m->pindex < start + pcount) + p = TAILQ_NEXT(m, listq); + else + p = NULL; + vm_page_change_lock(m, mtxp); + if (vm_page_held(m) || vm_page_busied(m) || + m->queue != PQ_INACTIVE) + continue; + if (m->valid == 0) + goto free_page; + if ((m->aflags & PGA_REFERENCED) != 0) + continue; + if (object->ref_count != 0) { + if (pmap_ts_referenced(m)) { + vm_page_aflag_set(m, PGA_REFERENCED); + continue; + } + vm_page_test_dirty(m); + if (m->dirty == 0) + pmap_remove_all(m); + } + if (m->dirty) { + if ((object->flags & OBJ_DEAD) == 0) + vm_page_launder(m); + continue; + } +free_page: + vm_page_free(m); + count++; + } +out: + VM_CNT_ADD(v_dfree, count); + + return (count); +} + /* * vm_pageout_scan does the dirty work for the pageout daemon. 
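+ *	When a page is freed, vm_pageout_free_pages() opportunistically
+ *	frees other clean, idle inactive pages that belong to the same
+ *	object->iosize-sized block of the object.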
* @@ -1103,13 +1255,16 @@ static bool vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage) { - vm_page_t m, next; + struct scan_state ss; + struct vm_batchqueue rq; + struct mtx *mtx; + vm_page_t m, marker; struct vm_pagequeue *pq; vm_object_t object; long min_scan; - int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; - int page_shortage, scan_tick, scanned, starting_page_shortage; - boolean_t queue_locked; + int act_delta, addl_page_shortage, deficit, inactq_shortage; + int page_shortage, scan_tick, starting_page_shortage; + bool obj_locked; /* * If we need to reclaim memory ask kernel caches to return @@ -1150,78 +1305,82 @@ page_shortage = deficit = 0; starting_page_shortage = page_shortage; + mtx = NULL; + obj_locked = false; + object = NULL; + vm_batchqueue_init(&rq); + /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make * decisions for the inactive queue, only for the active queue.) */ + marker = &vmd->vmd_markers[PQ_INACTIVE]; pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; - maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); - queue_locked = TRUE; - for (m = TAILQ_FIRST(&pq->pq_pl); - m != NULL && maxscan-- > 0 && page_shortage > 0; - m = next) { - vm_pagequeue_assert_locked(pq); - KASSERT(queue_locked, ("unlocked inactive queue")); - KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); + vm_pageout_init_scan(&ss, pq, marker, min(pq->pq_cnt, page_shortage)); + while ((m = vm_pageout_next(&ss, true)) != NULL) { + if (__predict_false((m->flags & PG_MARKER) != 0)) + continue; - VM_CNT_INC(v_pdpages); - next = TAILQ_NEXT(m, plinks.q); + vm_page_change_lock(m, &mtx); +recheck: /* - * skip marker pages + * The page may have been disassociated from the queue + * while locks were dropped. */ - if (m->flags & PG_MARKER) + if (!vm_pageout_page_queued(m, PQ_INACTIVE)) { + addl_page_shortage++; continue; + } - KASSERT((m->flags & PG_FICTITIOUS) == 0, - ("Fictitious page %p cannot be in inactive queue", m)); - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("Unmanaged page %p cannot be in inactive queue", m)); + /* + * A requeue was requested, so this page gets a second + * chance. + */ + if ((m->aflags & PGA_REQUEUE) != 0) + goto reinsert; /* - * The page or object lock acquisitions fail if the - * page was removed from the queue or moved to a - * different position within the queue. In either - * case, addl_page_shortage should not be incremented. + * Held pages are essentially stuck in the queue. So, + * they ought to be discounted from the inactive count. + * See the calculation of inactq_shortage before the + * loop over the active queue below. + * + * Wired pages may not be freed. Complete their removal + * from the queue now to avoid needless revisits during + * future scans. */ - if (!vm_pageout_page_lock(m, &next)) - goto unlock_page; - else if (m->wire_count != 0) { - /* - * Wired pages may not be freed, and unwiring a queued - * page will cause it to be requeued. Thus, remove them - * from the queue now to avoid unnecessary revisits. - */ - vm_page_dequeue_locked(m); + if (m->hold_count != 0) { addl_page_shortage++; - goto unlock_page; - } else if (m->hold_count != 0) { - /* - * Held pages are essentially stuck in the - * queue. So, they ought to be discounted - * from the inactive count. See the - * calculation of inactq_shortage before the - * loop over the active queue below. 
- */ + goto reinsert; + } + if (m->wire_count != 0) { addl_page_shortage++; - goto unlock_page; + vm_page_dequeue_lazy(m); + continue; } - object = m->object; - if (!VM_OBJECT_TRYWLOCK(object)) { - if (!vm_pageout_fallback_object_lock(m, &next)) - goto unlock_object; - else if (m->wire_count != 0) { - vm_page_dequeue_locked(m); - addl_page_shortage++; - goto unlock_object; - } else if (m->hold_count != 0) { - addl_page_shortage++; - goto unlock_object; + + if (object != m->object) { + if (obj_locked) { + VM_OBJECT_WUNLOCK(object); + obj_locked = false; } + object = m->object; } + if (!obj_locked) { + if (!VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(mtx); + VM_OBJECT_WLOCK(object); + obj_locked = true; + mtx_lock(mtx); + goto recheck; + } else + obj_locked = true; + } + if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at @@ -1232,26 +1391,8 @@ * inactive count. */ addl_page_shortage++; -unlock_object: - VM_OBJECT_WUNLOCK(object); -unlock_page: - vm_page_unlock(m); - continue; + goto reinsert; } - KASSERT(!vm_page_held(m), ("Held page %p", m)); - - /* - * Dequeue the inactive page and unlock the inactive page - * queue, invalidating the 'next' pointer. Dequeueing the - * page here avoids a later reacquisition (and release) of - * the inactive page queue lock when vm_page_activate(), - * vm_page_free(), or vm_page_launder() is called. Use a - * marker to remember our place in the inactive queue. - */ - TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); - vm_page_dequeue_locked(m); - vm_pagequeue_unlock(pq); - queue_locked = FALSE; /* * Invalid pages can be easily freed. They cannot be @@ -1289,14 +1430,10 @@ * queue. */ m->act_count += act_delta + ACT_ADVANCE; - goto drop_page; + continue; } else if ((object->flags & OBJ_DEAD) == 0) { - vm_pagequeue_lock(pq); - queue_locked = TRUE; - m->queue = PQ_INACTIVE; - TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - vm_pagequeue_cnt_inc(pq); - goto drop_page; + vm_page_aflag_set(m, PGA_REQUEUE); + goto reinsert; } } @@ -1322,21 +1459,49 @@ */ if (m->dirty == 0) { free_page: - vm_page_free(m); - VM_CNT_INC(v_dfree); - --page_shortage; + page_shortage -= vm_pageout_free_pages(object, + m, &mtx); } else if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); -drop_page: - vm_page_unlock(m); - VM_OBJECT_WUNLOCK(object); - if (!queue_locked) { + continue; + + /* + * Re-add stuck pages to the queue. We will examine them again + * during the next scan. If the queue state of a page has + * changed since it was physically removed from the page queue, + * don't do anything with that page. 
+ */ +reinsert: + if (!vm_batchqueue_insert(&rq, m)) { vm_pagequeue_lock(pq); - queue_locked = TRUE; + do { + if (!vm_page_inactive(m) || + (m->aflags & PGA_ENQUEUED) != 0) + continue; + vm_page_aflag_set(m, PGA_ENQUEUED); + if ((m->aflags & PGA_REQUEUE) != 0) { + TAILQ_INSERT_TAIL(&pq->pq_pl, m, + plinks.q); + vm_page_aflag_clear(m, PGA_REQUEUE); + } else + TAILQ_INSERT_BEFORE(ss.marker, m, + plinks.q); + vm_pagequeue_cnt_inc(pq); + } while ((m = vm_batchqueue_pop(&rq)) != NULL); + vm_pagequeue_unlock(pq); + vm_batchqueue_init(&rq); } - next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); - TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } + if (mtx != NULL) { + mtx_unlock(mtx); + mtx = NULL; + } + if (obj_locked) { + VM_OBJECT_WUNLOCK(object); + obj_locked = false; + } + vm_pagequeue_lock(pq); + vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); /* @@ -1399,9 +1564,9 @@ vm_paging_target(vmd) + deficit + addl_page_shortage; inactq_shortage *= act_scan_laundry_weight; + marker = &vmd->vmd_markers[PQ_ACTIVE]; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); - maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every @@ -1414,7 +1579,7 @@ min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; - if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) + if (min_scan > 0 || (inactq_shortage > 0 && pq->pq_cnt > 0)) vmd->vmd_last_active_scan = scan_tick; /* @@ -1422,35 +1587,40 @@ * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. */ - for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < - min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, - scanned++) { - KASSERT(m->queue == PQ_ACTIVE, - ("vm_pageout_scan: page %p isn't active", m)); - next = TAILQ_NEXT(m, plinks.q); - if ((m->flags & PG_MARKER) != 0) - continue; - KASSERT((m->flags & PG_FICTITIOUS) == 0, - ("Fictitious page %p cannot be in active queue", m)); - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("Unmanaged page %p cannot be in active queue", m)); - if (!vm_pageout_page_lock(m, &next)) { - vm_page_unlock(m); - continue; +act_scan: + TAILQ_INSERT_AFTER(&pq->pq_pl, &vmd->vmd_clock[0], marker, plinks.q); + vm_page_aflag_set(marker, PGA_ENQUEUED); + vm_pageout_init_scan(&ss, pq, marker, inactq_shortage > 0 ? + pq->pq_cnt : min_scan); + while ((m = vm_pageout_next(&ss, false)) != NULL) { + if (__predict_false(m == &vmd->vmd_clock[1])) { + vm_pagequeue_lock(pq); + TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); + TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q); + TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0], + plinks.q); + TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1], + plinks.q); + vm_pageout_end_scan(&ss); + goto act_scan; } + if (__predict_false((m->flags & PG_MARKER) != 0)) + continue; + + vm_page_change_lock(m, &mtx); /* - * The count for page daemon pages is updated after checking - * the page for eligibility. + * The page may have been disassociated from the queue + * while locks were dropped. */ - VM_CNT_INC(v_pdpages); + if (!vm_pageout_page_queued(m, PQ_ACTIVE)) + continue; /* * Wired pages are dequeued lazily. */ if (m->wire_count != 0) { - vm_page_dequeue_locked(m); - vm_page_unlock(m); + vm_page_dequeue_lazy(m); continue; } @@ -1494,9 +1664,6 @@ * queue depending on usage. */ if (m->act_count == 0) { - /* Dequeue to avoid later lock recursion. 
*/ - vm_page_dequeue_locked(m); - /* * When not short for inactive pages, let dirty pages go * through the inactive queue before moving to the @@ -1528,11 +1695,19 @@ inactq_shortage--; } } - } else - vm_page_requeue_locked(m); - vm_page_unlock(m); + } } + if (mtx != NULL) { + mtx_unlock(mtx); + mtx = NULL; + } + + vm_pagequeue_lock(pq); + TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); + TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q); + vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); + if (pass > 0) vm_swapout_run_idle(); return (page_shortage <= 0); @@ -1779,10 +1954,8 @@ KASSERT(vmd->vmd_segs != 0, ("domain without segments")); vmd->vmd_last_active_scan = ticks; - vm_pageout_init_marker(&vmd->vmd_marker, PQ_INACTIVE); - vm_pageout_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE); - TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, - &vmd->vmd_inacthead, plinks.q); + + vm_pageout_insert_markers(vmd); /* * The pageout daemon worker is never done, so loop forever. Index: sys/vm/vm_pagequeue.h =================================================================== --- sys/vm/vm_pagequeue.h +++ sys/vm/vm_pagequeue.h @@ -73,7 +73,18 @@ const char * const pq_name; } __aligned(CACHE_LINE_SIZE); +#ifndef VM_BATCHQUEUE_SIZE +#define VM_BATCHQUEUE_SIZE 7 +#endif + +struct vm_batchqueue { + vm_page_t bq_pa[VM_BATCHQUEUE_SIZE]; + int bq_cnt; +} __aligned(CACHE_LINE_SIZE); + +#include #include + struct sysctl_oid; /* @@ -81,17 +92,18 @@ * and accounting. * * Lock Key: - * f vmd_free_mtx - * p vmd_pageout_mtx - * d vm_domainset_lock - * a atomic - * c const after boot - * q page queue lock + * f vmd_free_mtx + * p vmd_pageout_mtx + * d vm_domainset_lock + * a atomic + * c const after boot + * q page queue lock */ struct vm_domain { struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; struct mtx_padalign vmd_free_mtx; struct mtx_padalign vmd_pageout_mtx; + uma_zone_t vmd_pgcache; /* (c) per-domain page free cache. */ struct vmem *vmd_kernel_arena; /* (c) per-domain kva arena. */ u_int vmd_domain; /* (c) Domain number. */ u_int vmd_page_count; /* (c) Total page count. */ @@ -105,9 +117,9 @@ boolean_t vmd_oom; int vmd_oom_seq; int vmd_last_active_scan; - struct vm_page vmd_laundry_marker; - struct vm_page vmd_marker; /* marker for pagedaemon private use */ + struct vm_page vmd_markers[PQ_COUNT]; struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ + struct vm_page vmd_clock[2]; /* markers for active queue scan */ int vmd_pageout_wanted; /* (a, p) pageout daemon wait channel */ int vmd_pageout_pages_needed; /* (d) page daemon waiting for pages? 
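struct vm_batchqueue above is a small fixed-size staging buffer: callers defer per-page queue operations into it and take the page queue lock once per batch rather than once per page. Here is a standalone sketch of that amortization under assumed stand-ins: ints replace vm_page pointers, a pthread mutex replaces the page queue lock, and a counter replaces the real queue update. With the default VM_BATCHQUEUE_SIZE of 7, each flush covers a full batch of deferred operations.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define	BATCHQUEUE_SIZE	7	/* mirrors the default VM_BATCHQUEUE_SIZE */

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static int queue_len;		/* protected by queue_lock */

struct batchqueue {
	int bq_item[BATCHQUEUE_SIZE];
	int bq_cnt;
};

/* Stage an item; fail when the batch is full so the caller can flush. */
static bool
batchqueue_insert(struct batchqueue *bq, int item)
{
	if (bq->bq_cnt < BATCHQUEUE_SIZE) {
		bq->bq_item[bq->bq_cnt++] = item;
		return (true);
	}
	return (false);
}

/* Apply the whole pending batch with a single lock acquisition. */
static void
batchqueue_flush(struct batchqueue *bq)
{
	pthread_mutex_lock(&queue_lock);
	queue_len += bq->bq_cnt;
	pthread_mutex_unlock(&queue_lock);
	bq->bq_cnt = 0;
}

int
main(void)
{
	struct batchqueue bq = { .bq_cnt = 0 };
	int i;

	for (i = 0; i < 20; i++) {
		if (!batchqueue_insert(&bq, i)) {
			batchqueue_flush(&bq);
			(void)batchqueue_insert(&bq, i);
		}
	}
	batchqueue_flush(&bq);	/* drain the final partial batch */
	printf("queue length is %d after 20 insertions\n", queue_len);
	return (0);
}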
*/ @@ -143,6 +155,7 @@ #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) #define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) +#define vm_pagequeue_trylock(pq) mtx_trylock(&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #define vm_domain_free_assert_locked(n) \ @@ -153,6 +166,8 @@ mtx_lock(vm_domain_free_lockptr((d))) #define vm_domain_free_lockptr(d) \ (&(d)->vmd_free_mtx) +#define vm_domain_free_trylock(d) \ + mtx_trylock(vm_domain_free_lockptr((d))) #define vm_domain_free_unlock(d) \ mtx_unlock(vm_domain_free_lockptr((d))) @@ -171,14 +186,39 @@ vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) { -#ifdef notyet vm_pagequeue_assert_locked(pq); -#endif pq->pq_cnt += addend; } #define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) #define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) +static inline void +vm_batchqueue_init(struct vm_batchqueue *bq) +{ + + bq->bq_cnt = 0; +} + +static inline bool +vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m) +{ + + if (bq->bq_cnt < nitems(bq->bq_pa)) { + bq->bq_pa[bq->bq_cnt++] = m; + return (true); + } + return (false); +} + +static inline vm_page_t +vm_batchqueue_pop(struct vm_batchqueue *bq) +{ + + if (bq->bq_cnt == 0) + return (NULL); + return (bq->bq_pa[--bq->bq_cnt]); +} + void vm_domain_set(struct vm_domain *vmd); void vm_domain_clear(struct vm_domain *vmd); int vm_domain_allocate(struct vm_domain *vmd, int req, int npages); Index: sys/vm/vm_phys.h =================================================================== --- sys/vm/vm_phys.h +++ sys/vm/vm_phys.h @@ -78,6 +78,7 @@ vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order); vm_page_t vm_phys_alloc_pages(int domain, int pool, int order); +int vm_phys_alloc_npages(int domain, int pool, vm_page_t *m, int cnt); int vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high); int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr); Index: sys/vm/vm_phys.c =================================================================== --- sys/vm/vm_phys.c +++ sys/vm/vm_phys.c @@ -354,9 +354,9 @@ m->order = order; if (tail) - TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q); + TAILQ_INSERT_TAIL(&fl[order].pl, m, listq); else - TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q); + TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); fl[order].lcnt++; } @@ -364,7 +364,7 @@ vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) { - TAILQ_REMOVE(&fl[order].pl, m, plinks.q); + TAILQ_REMOVE(&fl[order].pl, m, listq); fl[order].lcnt--; m->order = VM_NFREEORDER; } @@ -624,6 +624,26 @@ return (NULL); } +int +vm_phys_alloc_npages(int domain, int pool, vm_page_t *mp, int cnt) +{ + vm_page_t m; + int order, freelist; + + for (freelist = 0; freelist < VM_NFREELIST; freelist++) { + for (order = fls(cnt) -1; order >= 0; order--) { + m = vm_phys_alloc_freelist_pages(domain, freelist, + pool, order); + if (m != NULL) { + *mp = m; + return (1 << order); + } + } + } + *mp = NULL; + return (0); +} + /* * Allocate a contiguous, power of two-sized set of physical pages from the * specified free list. 
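vm_phys_alloc_npages() returns at most cnt pages as a single power-of-two sized run, with the first page of the run passed back through *mp, and it may return fewer pages than requested or none at all, so callers are expected to loop. The sketch below models only that caller-side contract: a toy integer pool stands in for the physical-memory freelists, and the domain and pool arguments and the vm_page_t run are omitted.

#include <stdio.h>

static int pool_avail = 10;	/* pages remaining in a toy free pool */

/*
 * Model of the contract: hand back a power-of-two sized run no larger
 * than the request, or 0 when nothing is available.
 */
static int
alloc_npages(int cnt)
{
	int run;

	if (pool_avail == 0)
		return (0);
	if (cnt > pool_avail)
		cnt = pool_avail;
	for (run = 1; run * 2 <= cnt; run *= 2)
		continue;
	pool_avail -= run;
	return (run);
}

int
main(void)
{
	int want = 7, have = 0, got;

	while (have < want) {
		got = alloc_npages(want - have);
		if (got == 0)
			break;	/* nothing left; the caller must cope */
		have += got;
		printf("got a run of %d page(s), %d/%d\n", got, have, want);
	}
	return (0);
}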
The free list must be specified using one of the @@ -1176,7 +1196,7 @@ oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = (*seg->free_queues)[pind]; - TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { + TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) { /* * Is the size of this allocation request * larger than the largest block size? Index: sys/vm/vm_reserv.c =================================================================== --- sys/vm/vm_reserv.c +++ sys/vm/vm_reserv.c @@ -419,7 +419,7 @@ index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); - KASSERT(rv->domain < vm_ndomains, + KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains, ("vm_reserv_depopulate: reserv %p's domain is corrupted %d", rv, rv->domain)); if (rv->popcnt == VM_LEVEL_0_NPAGES) { @@ -531,7 +531,7 @@ ("vm_reserv_populate: reserv %p is already full", rv)); KASSERT(rv->pages->psind == 0, ("vm_reserv_populate: reserv %p is already promoted", rv)); - KASSERT(rv->domain < vm_ndomains, + KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains, ("vm_reserv_populate: reserv %p's domain is corrupted %d", rv, rv->domain)); popmap_set(rv->popmap, index); @@ -1218,7 +1218,7 @@ vm_reserv_domain_lock(rv->domain); KASSERT(rv->inpartpopq, ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); - KASSERT(rv->domain < vm_ndomains, + KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains, ("vm_reserv_reclaim: reserv %p's domain is corrupted %d", rv, rv->domain)); TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); Index: sys/vm/vnode_pager.c =================================================================== --- sys/vm/vnode_pager.c +++ sys/vm/vnode_pager.c @@ -249,6 +249,7 @@ object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; + object->iosize = vp->v_mount->mnt_stat.f_iosize; object->domain.dr_policy = vnode_domainset; object->handle = handle; @@ -769,7 +770,7 @@ object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); - bsize = vp->v_mount->mnt_stat.f_iosize; + bsize = object->iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size,
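The vnode pager hunks cache the mount point's f_iosize in the object at creation time, so per-request code such as the getpages path can derive its block geometry from the object alone rather than reaching through vp->v_mount on each call. A worked example of that geometry with assumed sizes, a 32 KB filesystem block and 4 KB pages, follows.

#include <stdio.h>

#define	PAGE_SIZE	4096		/* assumed machine page size */

int
main(void)
{
	long bsize = 32768;		/* assumed f_iosize for the mount */
	long pagesperblock = bsize / PAGE_SIZE;

	printf("%ld pages per filesystem block\n", pagesperblock);
	return (0);
}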