Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -578,6 +578,7 @@
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
 void vm_page_sunbusy(vm_page_t m);
+void vm_page_swapqueue(vm_page_t m, int oldq, int newq);
 int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
 void vm_page_unswappable(vm_page_t m);
@@ -669,6 +670,23 @@
  */
 CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
 
+/*
+ * We want to be able to update the aflags and queue fields atomically.
+ */
+CTASSERT(offsetof(struct vm_page, aflags) / sizeof(uint32_t) ==
+    offsetof(struct vm_page, queue) / sizeof(uint32_t));
+CTASSERT(offsetof(struct vm_page, queue) % sizeof(uint32_t) == 2);
+CTASSERT(sizeof(((struct vm_page *)NULL)->queue) == 1);
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define	VM_PAGE_AFLAG_SHIFT	0
+#define	VM_PAGE_QUEUE_SHIFT	16
+#else
+#define	VM_PAGE_AFLAG_SHIFT	24
+#define	VM_PAGE_QUEUE_SHIFT	8
+#endif
+#define	VM_PAGE_QUEUE_MASK	(0xff << VM_PAGE_QUEUE_SHIFT)
+
 /*
  * Clear the given bits in the specified page.
  */
@@ -689,12 +707,7 @@
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_clear: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_clear_32(addr, val);
 }
 
@@ -714,14 +727,40 @@
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_set: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_set_32(addr, val);
-}
+}
+
+/*
+ * Atomically update the queue state of the page.  The operation fails if
+ * any of the queue flags in "fflags" are set or if the "queue" field of
+ * the page does not match the expected value; if the operation is
+ * successful, the flags in "nflags" are set and all other queue state
+ * flags are cleared.
+ */
+static inline bool
+vm_page_aflag_queue_cmpset(vm_page_t m, int oldq, int newq, uint8_t fflags,
+    uint8_t nflags)
+{
+	uint32_t *addr, nval, oval, qsmask;
+
+	vm_page_assert_locked(m);
+
+	addr = (void *)&m->aflags;
+	oval = atomic_load_32(addr);
+	qsmask = ((PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) <<
+	    VM_PAGE_AFLAG_SHIFT) | VM_PAGE_QUEUE_MASK;
+	do {
+		if ((oval & (fflags << VM_PAGE_AFLAG_SHIFT)) != 0)
+			return (false);
+		if ((oval & VM_PAGE_QUEUE_MASK) != oldq << VM_PAGE_QUEUE_SHIFT)
+			return (false);
+		nval = (oval & ~qsmask) | (nflags << VM_PAGE_AFLAG_SHIFT) |
+		    (newq << VM_PAGE_QUEUE_SHIFT);
+	} while (!atomic_fcmpset_32(addr, &oval, nval));
+
+	return (true);
+}
 
 /*
  *	vm_page_dirty:
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -3260,17 +3260,27 @@
 
 	if ((queue = vm_page_queue(m)) == PQ_NONE)
 		return;
-	vm_page_aflag_set(m, PGA_DEQUEUE);
-	vm_pqbatch_submit_page(m, queue);
+
+	/*
+	 * Set PGA_DEQUEUE if it is not already set to handle a concurrent call
+	 * to vm_page_dequeue_deferred_free().  In particular, avoid modifying
+	 * the page's queue state once vm_page_dequeue_deferred_free() has been
+	 * called.  In the event of a race, two batch queue entries for the page
+	 * will be created, but the second will have no effect.
+	 */
+	if (vm_page_aflag_queue_cmpset(m, queue, queue, PGA_DEQUEUE,
+	    PGA_DEQUEUE))
+		vm_pqbatch_submit_page(m, queue);
 }
 
 /*
  * A variant of vm_page_dequeue_deferred() that does not assert the page
- * lock and is only to be called from vm_page_free_prep().  It is just an
- * open-coded implementation of vm_page_dequeue_deferred().  Because the
- * page is being freed, we can assume that nothing else is scheduling queue
- * operations on this page, so we get for free the mutual exclusion that
- * is otherwise provided by the page lock.
+ * lock and is only to be called from vm_page_free_prep().  Because the
+ * page is being freed, we can assume that nothing other than the page
+ * daemon is scheduling queue operations on this page, so we get for
+ * free the mutual exclusion that is otherwise provided by the page lock.
+ * To handle races, the page daemon must take care to atomically check
+ * for PGA_DEQUEUE when updating queue state.
  */
 static void
 vm_page_dequeue_deferred_free(vm_page_t m)
@@ -3388,6 +3398,55 @@
 	vm_pqbatch_submit_page(m, atomic_load_8(&m->queue));
 }
 
+/*
+ *	vm_page_swapqueue:		[ internal use only ]
+ *
+ *	Move the page from one queue to another, or to the tail of its
+ *	current queue, in the face of a possible concurrent call to
+ *	vm_page_dequeue_deferred_free().
+ */
+void
+vm_page_swapqueue(vm_page_t m, int oldq, int newq)
+{
+	struct vm_pagequeue *pq;
+
+	KASSERT(oldq < PQ_COUNT && newq < PQ_COUNT,
+	    ("vm_page_swapqueue: invalid queues (%d, %d)", oldq, newq));
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("vm_page_swapqueue: page %p is unmanaged", m));
+	vm_page_assert_locked(m);
+
+	if (oldq == newq) {
+		/*
+		 * Atomically set PGA_REQUEUE if the page belongs to the
+		 * specified queue and does not have PGA_DEQUEUE set.
+		 */
+		if (vm_page_aflag_queue_cmpset(m, oldq, newq,
+		    PGA_DEQUEUE | PGA_REQUEUE, PGA_REQUEUE))
+			vm_pqbatch_submit_page(m, newq);
+		return;
+	}
+
+	/*
+	 * Atomically update the queue field and set PGA_REQUEUE while
+	 * ensuring that PGA_DEQUEUE has not been set.
+	 */
+	pq = &vm_pagequeue_domain(m)->vmd_pagequeues[oldq];
+	vm_pagequeue_lock(pq);
+	if (!vm_page_aflag_queue_cmpset(m, oldq, newq, PGA_DEQUEUE,
+	    PGA_REQUEUE)) {
+		vm_pagequeue_unlock(pq);
+		return;
+	}
+	if ((m->aflags & PGA_ENQUEUED) != 0) {
+		TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+		vm_pagequeue_cnt_dec(pq);
+		vm_page_aflag_clear(m, PGA_ENQUEUED);
+	}
+	vm_pagequeue_unlock(pq);
+	vm_pqbatch_submit_page(m, newq);
+}
+
 /*
  *	vm_page_free_prep:
  *
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -742,7 +742,7 @@
 		 * chance.
 		 */
 		if ((m->aflags & PGA_REQUEUE) != 0) {
-			vm_page_requeue(m);
+			vm_page_swapqueue(m, queue, queue);
 			continue;
 		}
 
@@ -1256,9 +1256,9 @@
 			 * place them directly in the laundry queue to reduce
 			 * queuing overhead.
 			 */
-			if (page_shortage <= 0)
-				vm_page_deactivate(m);
-			else {
+			if (page_shortage <= 0) {
+				vm_page_swapqueue(m, PQ_ACTIVE, PQ_INACTIVE);
+			} else {
 				/*
 				 * Calling vm_page_test_dirty() here would
 				 * require acquisition of the object's write
@@ -1270,11 +1270,13 @@
 				 * dirty field by the pmap.
 				 */
 				if (m->dirty == 0) {
-					vm_page_deactivate(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_INACTIVE);
 					page_shortage -=
 					    act_scan_laundry_weight;
 				} else {
-					vm_page_launder(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_LAUNDRY);
 					page_shortage--;
 				}
 			}
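
Note (reviewer sketch, not part of the patch): the program below is a minimal
userspace model of the technique that the new vm_page_aflag_queue_cmpset()
helper relies on, namely packing the 8-bit aflags and queue fields into one
aligned 32-bit word so that both can be checked and updated in a single
compare-and-swap loop.  The flag values, the little-endian layout, and the
C11 <stdatomic.h> calls are illustrative assumptions; they do not match the
kernel's PGA_*/PQ_* definitions or the atomic(9) API.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative flag and queue values, not the kernel's definitions. */
#define	F_DEQUEUE	0x01
#define	F_REQUEUE	0x02
#define	Q_ACTIVE	1
#define	Q_INACTIVE	2

#define	FLAG_SHIFT	0	/* assumes a little-endian layout */
#define	QUEUE_SHIFT	16
#define	QUEUE_MASK	(0xffu << QUEUE_SHIFT)

/*
 * Fail if any flag in "fflags" is set or if the queue field does not
 * equal "oldq"; otherwise install "newq" and "nflags" and clear the
 * other queue-state flags, all in one atomic update of the word.
 */
static bool
pqstate_cmpset(_Atomic uint32_t *word, int oldq, int newq, uint8_t fflags,
    uint8_t nflags)
{
	uint32_t nval, oval, qsmask;

	qsmask = ((F_DEQUEUE | F_REQUEUE) << FLAG_SHIFT) | QUEUE_MASK;
	oval = atomic_load(word);
	do {
		if ((oval & ((uint32_t)fflags << FLAG_SHIFT)) != 0)
			return (false);
		if ((oval & QUEUE_MASK) != (uint32_t)oldq << QUEUE_SHIFT)
			return (false);
		nval = (oval & ~qsmask) |
		    ((uint32_t)nflags << FLAG_SHIFT) |
		    ((uint32_t)newq << QUEUE_SHIFT);
	} while (!atomic_compare_exchange_weak(word, &oval, nval));

	return (true);
}

int
main(void)
{
	_Atomic uint32_t word = (uint32_t)Q_ACTIVE << QUEUE_SHIFT;

	/* Move "active" -> "inactive" unless a dequeue is pending: succeeds. */
	printf("%d\n", pqstate_cmpset(&word, Q_ACTIVE, Q_INACTIVE,
	    F_DEQUEUE, F_REQUEUE));

	/* A second requeue attempt fails because F_REQUEUE is now set. */
	printf("%d\n", pqstate_cmpset(&word, Q_INACTIVE, Q_INACTIVE,
	    F_DEQUEUE | F_REQUEUE, F_REQUEUE));

	return (0);
}

As in the patch, a caller that finds one of its fail flags already set simply
backs off, which is what lets vm_page_dequeue_deferred() and the oldq == newq
case of vm_page_swapqueue() tolerate a concurrent deferred dequeue without
holding the page queue lock.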