Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -147,28 +147,34 @@
  * sleep until the page's busy state changes, after which the caller
  * must re-lookup the page and re-evaluate its state.
  *
- * The queue field is the index of the page queue containing the
- * page, or PQ_NONE if the page is not enqueued.  The queue lock of a
- * page is the page queue lock corresponding to the page queue index,
- * or the page lock (P) for the page if it is not enqueued.  To modify
- * the queue field, the queue lock for the old value of the field must
- * be held.  It is invalid for a page's queue field to transition
- * between two distinct page queue indices.  That is, when updating
- * the queue field, either the new value or the old value must be
- * PQ_NONE.
+ * The queue field is the index of the page queue containing the page,
+ * or PQ_NONE if the page is not enqueued.  The queue lock of a page is
+ * the page queue lock corresponding to the page queue index, or the
+ * page lock (P) for the page if it is not enqueued.  To modify the
+ * queue field, the queue lock for the old value of the field must be
+ * held.  There is one exception to this rule: the page daemon may
+ * transition the queue field from PQ_INACTIVE to PQ_NONE immediately
+ * prior to freeing a page during an inactive queue scan.  At that
+ * point the page has already been physically dequeued and no other
+ * references to that vm_page structure exist.
  *
  * To avoid contention on page queue locks, page queue operations
- * (enqueue, dequeue, requeue) are batched using per-CPU queues.
- * A deferred operation is requested by inserting an entry into a
- * batch queue; the entry is simply a pointer to the page, and the
- * request type is encoded in the page's aflags field using the values
- * in PGA_QUEUE_STATE_MASK.  The type-stability of struct vm_pages is
+ * (enqueue, dequeue, requeue) are batched using per-CPU queues.  A
+ * deferred operation is requested by inserting an entry into a batch
+ * queue; the entry is simply a pointer to the page, and the request
+ * type is encoded in the page's aflags field using the values in
+ * PGA_QUEUE_STATE_MASK.  The type-stability of struct vm_pages is
  * crucial to this scheme since the processing of entries in a given
- * batch queue may be deferred indefinitely.  In particular, a page
- * may be freed before its pending batch queue entries have been
- * processed.  The page lock (P) must be held to schedule a batched
- * queue operation, and the page queue lock must be held in order to
- * process batch queue entries for the page queue.
+ * batch queue may be deferred indefinitely.  In particular, a page may
+ * be freed before its pending batch queue entries have been processed.
+ * The page lock (P) must be held to schedule a batched queue
+ * operation, and the page queue lock must be held in order to process
+ * batch queue entries for the page queue.  There is one exception to
+ * this rule: the thread freeing a page may schedule a dequeue without
+ * holding the page lock.  In this scenario the only other thread which
+ * may hold a reference to the page is the page daemon, which is
+ * careful to avoid modifying the page's queue state once the dequeue
+ * has been requested by setting PGA_DEQUEUE.
  */

 #if PAGE_SIZE == 4096
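The batching scheme described in the comment above can be illustrated with a
short userland sketch.  This is not part of the patch; the flag value and the
batch-queue shape are simplified stand-ins for the real PGA_* definitions and
struct vm_batchqueue.  It shows the two-step protocol: publish the requested
operation in the page's atomically updated flags word, then record a bare
page pointer in a fixed-size batch.

#include <stdatomic.h>
#include <stdio.h>

#define	PGA_REQUEUE	0x20	/* illustrative value, not the real one */

struct page {
	_Atomic unsigned int aflags;	/* stands in for m->aflags */
};

#define	BQ_SIZE	8
struct batchqueue {
	struct page *bq_pa[BQ_SIZE];	/* entries are bare page pointers */
	int bq_cnt;
};

/*
 * Request a deferred requeue: encode the operation in the page's flags
 * word, then remember the page in the per-CPU batch for later processing.
 */
static void
requeue_deferred(struct batchqueue *bq, struct page *m)
{
	atomic_fetch_or(&m->aflags, PGA_REQUEUE);
	if (bq->bq_cnt < BQ_SIZE)
		bq->bq_pa[bq->bq_cnt++] = m;
}

int
main(void)
{
	struct batchqueue bq = { .bq_cnt = 0 };
	struct page m = { 0 };

	requeue_deferred(&bq, &m);
	printf("pending ops %#x, batched pages %d\n",
	    atomic_load(&m.aflags), bq.bq_cnt);
	return (0);
}

Because an entry carries no payload beyond the pointer, a page's pending
entries may be processed long after they were queued, which is why the
type-stability of the page structures matters.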
@@ -578,6 +584,7 @@
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
 void vm_page_sunbusy(vm_page_t m);
+void vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq);
 int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
 void vm_page_unswappable(vm_page_t m);
@@ -667,7 +674,29 @@
  * destinations.  In order that we can easily use a 32-bit operation, we
  * require that the aflags field be 32-bit aligned.
  */
-CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
+_Static_assert(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0,
+    "aflags field is not 32-bit aligned");
+
+/*
+ * We want to be able to update the aflags and queue fields atomically in
+ * the same operation.
+ */
+_Static_assert(offsetof(struct vm_page, aflags) / sizeof(uint32_t) ==
+    offsetof(struct vm_page, queue) / sizeof(uint32_t),
+    "aflags and queue fields do not belong to the same 32-bit word");
+_Static_assert(offsetof(struct vm_page, queue) % sizeof(uint32_t) == 2,
+    "queue field is at an unexpected offset");
+_Static_assert(sizeof(((struct vm_page *)NULL)->queue) == 1,
+    "queue field has an unexpected size");
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define	VM_PAGE_AFLAG_SHIFT	0
+#define	VM_PAGE_QUEUE_SHIFT	16
+#else
+#define	VM_PAGE_AFLAG_SHIFT	24
+#define	VM_PAGE_QUEUE_SHIFT	8
+#endif
+#define	VM_PAGE_QUEUE_MASK	(0xff << VM_PAGE_QUEUE_SHIFT)

 /*
  * Clear the given bits in the specified page.
@@ -689,12 +718,7 @@
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_clear: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_clear_32(addr, val);
 }

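The endian-dependent shifts replace the old BYTE_ORDER == BIG_ENDIAN special
case in vm_page_aflag_clear() and vm_page_aflag_set().  The arithmetic can be
sanity-checked outside the kernel; the struct below is a hypothetical stand-in
that only reproduces the byte offsets the static assertions above pin down
(aflags at byte 0 of the 32-bit word, queue at byte 2).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fakepage {
	uint8_t aflags;	/* byte offset 0 within the word */
	uint8_t pad;
	uint8_t queue;	/* byte offset 2, as asserted in the patch */
	uint8_t pad2;
};

int
main(void)
{
	struct fakepage p;
	uint32_t word;
	int aflag_shift, queue_shift, little;
	union { uint32_t u; uint8_t b[4]; } probe = { .u = 1 };

	little = probe.b[0] == 1;

	/* The shifts the patch selects for each byte order. */
	aflag_shift = little ? 0 : 24;
	queue_shift = little ? 16 : 8;

	/* A store through the 32-bit word must land on the right bytes. */
	memset(&p, 0, sizeof(p));
	word = (0xabu << aflag_shift) | (0x7u << queue_shift);
	memcpy(&p, &word, sizeof(word));
	assert(p.aflags == 0xab && p.queue == 0x7);
	printf("shifts check out for this byte order\n");
	return (0);
}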
@@ -724,14 +748,44 @@
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_set: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_set_32(addr, val);
-}
+}
+
+/*
+ * Atomically update the queue state of the page.  The operation fails if
+ * any of the queue flags in "fflags" are set or if the "queue" field of
+ * the page does not match the expected value; if the operation is
+ * successful, the flags in "nflags" are set and all other queue state
+ * flags are cleared.
+ */
+static inline bool
+vm_page_pqstate_cmpset(vm_page_t m, uint32_t oldq, uint32_t newq,
+    uint32_t fflags, uint32_t nflags)
+{
+	uint32_t *addr, nval, oval, qsmask;
+
+	vm_page_assert_locked(m);
+
+	fflags <<= VM_PAGE_AFLAG_SHIFT;
+	nflags <<= VM_PAGE_AFLAG_SHIFT;
+	newq <<= VM_PAGE_QUEUE_SHIFT;
+	oldq <<= VM_PAGE_QUEUE_SHIFT;
+	qsmask = ((PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) <<
+	    VM_PAGE_AFLAG_SHIFT) | VM_PAGE_QUEUE_MASK;
+
+	addr = (void *)&m->aflags;
+	oval = atomic_load_32(addr);
+	do {
+		if ((oval & fflags) != 0)
+			return (false);
+		if ((oval & VM_PAGE_QUEUE_MASK) != oldq)
+			return (false);
+		nval = (oval & ~qsmask) | nflags | newq;
+	} while (!atomic_fcmpset_32(addr, &oval, nval));
+
+	return (true);
+}

 /*
  * vm_page_dirty:
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -3134,6 +3134,13 @@
 	vm_batchqueue_init(bq);
 }

+/*
+ * vm_page_pqbatch_submit:		[ internal use only ]
+ *
+ *	Enqueue a page in the specified page queue's batched work queue.
+ *	The caller must have encoded the requested operation in the page
+ *	structure's aflags field.
+ */
 void
 vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
 {
@@ -3255,17 +3262,26 @@
 	if ((queue = vm_page_queue(m)) == PQ_NONE)
 		return;
-	vm_page_aflag_set(m, PGA_DEQUEUE);
-	vm_page_pqbatch_submit(m, queue);
+
+	/*
+	 * Set PGA_DEQUEUE if it is not already set to handle a concurrent call
+	 * to vm_page_dequeue_deferred_free().  In particular, avoid modifying
+	 * the page's queue state once vm_page_dequeue_deferred_free() has been
+	 * called.  In the event of a race, two batch queue entries for the page
+	 * will be created, but the second will have no effect.
+	 */
+	if (vm_page_pqstate_cmpset(m, queue, queue, PGA_DEQUEUE, PGA_DEQUEUE))
+		vm_page_pqbatch_submit(m, queue);
 }

 /*
  * A variant of vm_page_dequeue_deferred() that does not assert the page
- * lock and is only to be called from vm_page_free_prep().  It is just an
- * open-coded implementation of vm_page_dequeue_deferred().  Because the
- * page is being freed, we can assume that nothing else is scheduling queue
- * operations on this page, so we get for free the mutual exclusion that
- * is otherwise provided by the page lock.
+ * lock and is only to be called from vm_page_free_prep().  Because the
+ * page is being freed, we can assume that nothing other than the page
+ * daemon is scheduling queue operations on this page, so we get for
+ * free the mutual exclusion that is otherwise provided by the page lock.
+ * To handle races, the page daemon must take care to atomically check
+ * for PGA_DEQUEUE when updating queue state.
  */
 static void
 vm_page_dequeue_deferred_free(vm_page_t m)
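vm_page_dequeue_deferred() now claims PGA_DEQUEUE with
vm_page_pqstate_cmpset() instead of setting it unconditionally.  A
self-contained userland model of that pattern, assuming illustrative flag
values and the little-endian queue shift, shows why the operation is
idempotent: the second attempt observes the very flag it passes as a blocker
and fails, so only one batch queue entry has any effect.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	PGA_DEQUEUE		0x10	/* illustrative values */
#define	PGA_REQUEUE		0x20
#define	PGA_REQUEUE_HEAD	0x40
#define	QUEUE_SHIFT		16
#define	QUEUE_MASK		(0xffu << QUEUE_SHIFT)

/* Mirrors vm_page_pqstate_cmpset()'s fail/retry structure. */
static bool
pqstate_cmpset(_Atomic uint32_t *state, uint32_t oldq, uint32_t newq,
    uint32_t fflags, uint32_t nflags)
{
	uint32_t nval, oval, qsmask;

	qsmask = PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD | QUEUE_MASK;
	oval = atomic_load(state);
	do {
		if ((oval & fflags) != 0)
			return (false);	/* a conflicting op is pending */
		if ((oval & QUEUE_MASK) != oldq << QUEUE_SHIFT)
			return (false);	/* queue changed under us */
		nval = (oval & ~qsmask) | nflags | (newq << QUEUE_SHIFT);
	} while (!atomic_compare_exchange_weak(state, &oval, nval));
	return (true);
}

int
main(void)
{
	_Atomic uint32_t state = 1u << QUEUE_SHIFT;	/* on queue 1 */

	/* The dequeue-deferred pattern: claim PGA_DEQUEUE exactly once. */
	printf("first attempt: %d\n",
	    pqstate_cmpset(&state, 1, 1, PGA_DEQUEUE, PGA_DEQUEUE));
	printf("second attempt: %d\n",
	    pqstate_cmpset(&state, 1, 1, PGA_DEQUEUE, PGA_DEQUEUE));
	return (0);
}

Here atomic_compare_exchange_weak() plays the role of atomic_fcmpset_32(): on
failure it reloads the current value into oval, so the loop re-evaluates its
preconditions against fresh state rather than spinning on a stale snapshot.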
@@ -3378,6 +3394,42 @@
 	vm_page_pqbatch_submit(m, atomic_load_8(&m->queue));
 }

+/*
+ * vm_page_swapqueue:		[ internal use only ]
+ *
+ *	Move the page from one queue to another, or to the tail of its
+ *	current queue, in the face of a possible concurrent call to
+ *	vm_page_dequeue_deferred_free().
+ */
+void
+vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq)
+{
+	struct vm_pagequeue *pq;
+
+	KASSERT(oldq < PQ_COUNT && newq < PQ_COUNT && oldq != newq,
+	    ("vm_page_swapqueue: invalid queues (%d, %d)", oldq, newq));
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("vm_page_swapqueue: page %p is unmanaged", m));
+	vm_page_assert_locked(m);
+
+	/*
+	 * Atomically update the queue field and set PGA_REQUEUE while
+	 * ensuring that PGA_DEQUEUE has not been set.
+	 */
+	pq = &vm_pagequeue_domain(m)->vmd_pagequeues[oldq];
+	vm_pagequeue_lock(pq);
+	if (!vm_page_pqstate_cmpset(m, oldq, newq, PGA_DEQUEUE, PGA_REQUEUE)) {
+		vm_pagequeue_unlock(pq);
+		return;
+	}
+	if ((m->aflags & PGA_ENQUEUED) != 0) {
+		vm_pagequeue_remove(pq, m);
+		vm_page_aflag_clear(m, PGA_ENQUEUED);
+	}
+	vm_pagequeue_unlock(pq);
+	vm_page_pqbatch_submit(m, newq);
+}
+
 /*
  * vm_page_free_prep:
  *
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -742,7 +742,7 @@
 		 * chance.
 		 */
 		if ((m->aflags & PGA_REQUEUE) != 0) {
-			vm_page_requeue(m);
+			vm_page_pqbatch_submit(m, queue);
 			continue;
 		}

@@ -1256,9 +1256,9 @@
 			 * place them directly in the laundry queue to reduce
 			 * queuing overhead.
 			 */
-			if (page_shortage <= 0)
-				vm_page_deactivate(m);
-			else {
+			if (page_shortage <= 0) {
+				vm_page_swapqueue(m, PQ_ACTIVE, PQ_INACTIVE);
+			} else {
 				/*
 				 * Calling vm_page_test_dirty() here would
 				 * require acquisition of the object's write
@@ -1270,11 +1270,13 @@
 				 * dirty field by the pmap.
 				 */
 				if (m->dirty == 0) {
-					vm_page_deactivate(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_INACTIVE);
 					page_shortage -=
 					    act_scan_laundry_weight;
 				} else {
-					vm_page_launder(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_LAUNDRY);
 					page_shortage--;
 				}
 			}
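The active queue scan now moves pages between queues with
vm_page_swapqueue(), whose cmpset fails if a freeing thread has already
requested a dequeue.  The sketch below models just that interaction, under
the same illustrative encoding as the sketch above: once PGA_DEQUEUE is
visible, the swap refuses to touch the page's queue state.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	PGA_DEQUEUE	0x10	/* illustrative values */
#define	PGA_REQUEUE	0x20
#define	QUEUE_SHIFT	16
#define	QUEUE_MASK	(0xffu << QUEUE_SHIFT)
#define	PQ_INACTIVE	1	/* illustrative queue indices */
#define	PQ_ACTIVE	2

/* Fail if any flag in "fflags" is set or the queue index is not "oldq". */
static bool
swap_queue(_Atomic uint32_t *state, uint32_t oldq, uint32_t newq,
    uint32_t fflags, uint32_t nflags)
{
	uint32_t nval, oval;

	oval = atomic_load(state);
	do {
		if ((oval & fflags) != 0 ||
		    (oval & QUEUE_MASK) != oldq << QUEUE_SHIFT)
			return (false);
		nval = (oval & ~(QUEUE_MASK | PGA_DEQUEUE | PGA_REQUEUE)) |
		    nflags | (newq << QUEUE_SHIFT);
	} while (!atomic_compare_exchange_weak(state, &oval, nval));
	return (true);
}

int
main(void)
{
	_Atomic uint32_t state = PQ_ACTIVE << QUEUE_SHIFT;

	/* The freeing thread wins the race and marks the page. */
	atomic_fetch_or(&state, PGA_DEQUEUE);

	/* The page daemon's swap now backs off, as in the scan above. */
	assert(!swap_queue(&state, PQ_ACTIVE, PQ_INACTIVE, PGA_DEQUEUE,
	    PGA_REQUEUE));
	printf("swap refused; state %#x unchanged\n", atomic_load(&state));
	return (0);
}

The page queue lock held across the cmpset in vm_page_swapqueue() then
guarantees that a page observed with PGA_ENQUEUED can be removed from the old
queue before the batched enqueue onto the new one is submitted.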