Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -147,28 +147,34 @@
  * sleep until the page's busy state changes, after which the caller
  * must re-lookup the page and re-evaluate its state.
  *
- * The queue field is the index of the page queue containing the
- * page, or PQ_NONE if the page is not enqueued.  The queue lock of a
- * page is the page queue lock corresponding to the page queue index,
- * or the page lock (P) for the page if it is not enqueued.  To modify
- * the queue field, the queue lock for the old value of the field must
- * be held.  It is invalid for a page's queue field to transition
- * between two distinct page queue indices.  That is, when updating
- * the queue field, either the new value or the old value must be
- * PQ_NONE.
+ * The queue field is the index of the page queue containing the page,
+ * or PQ_NONE if the page is not enqueued.  The queue lock of a page is
+ * the page queue lock corresponding to the page queue index, or the
+ * page lock (P) for the page if it is not enqueued.  To modify the
+ * queue field, the queue lock for the old value of the field must be
+ * held.  There is one exception to this rule: the page daemon may
+ * transition the queue field from PQ_INACTIVE to PQ_NONE immediately
+ * prior to freeing a page during an inactive queue scan.  At that
+ * point the page has already been physically dequeued and no other
+ * references to that vm_page structure exist.
  *
  * To avoid contention on page queue locks, page queue operations
- * (enqueue, dequeue, requeue) are batched using per-CPU queues.
- * A deferred operation is requested by inserting an entry into a
- * batch queue; the entry is simply a pointer to the page, and the
- * request type is encoded in the page's aflags field using the values
- * in PGA_QUEUE_STATE_MASK.  The type-stability of struct vm_pages is
+ * (enqueue, dequeue, requeue) are batched using per-CPU queues.  A
+ * deferred operation is requested by inserting an entry into a batch
+ * queue; the entry is simply a pointer to the page, and the request
+ * type is encoded in the page's aflags field using the values in
+ * PGA_QUEUE_STATE_MASK.  The type-stability of struct vm_pages is
  * crucial to this scheme since the processing of entries in a given
- * batch queue may be deferred indefinitely.  In particular, a page
- * may be freed before its pending batch queue entries have been
- * processed.  The page lock (P) must be held to schedule a batched
- * queue operation, and the page queue lock must be held in order to
- * process batch queue entries for the page queue.
+ * batch queue may be deferred indefinitely.  In particular, a page may
+ * be freed before its pending batch queue entries have been processed.
+ * The page lock (P) must be held to schedule a batched queue
+ * operation, and the page queue lock must be held in order to process
+ * batch queue entries for the page queue.  There is one exception to
+ * this rule: the thread freeing a page may schedule a dequeue without
+ * holding the page lock.  In this scenario the only other thread which
+ * may hold a reference to the page is the page daemon, which is
+ * careful to avoid modifying the page's queue state once the dequeue
+ * has been requested by setting PGA_DEQUEUE.
  */

 #if PAGE_SIZE == 4096
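The batching scheme described in the comment above can be illustrated with a
short userland sketch.  This is not part of the patch; the flag value and the
batch-queue shape are simplified stand-ins for the real PGA_* definitions and
struct vm_batchqueue.  It shows the two-step protocol: publish the requested
operation in the page's atomically updated flags word, then record a bare
page pointer in a fixed-size batch.

#include <stdatomic.h>
#include <stdio.h>

#define	PGA_REQUEUE	0x20	/* illustrative value, not the real one */

struct page {
	_Atomic unsigned int aflags;	/* stands in for m->aflags */
};

#define	BQ_SIZE	8
struct batchqueue {
	struct page *bq_pa[BQ_SIZE];	/* entries are bare page pointers */
	int bq_cnt;
};

/*
 * Request a deferred requeue: encode the operation in the page's flags
 * word, then remember the page in the per-CPU batch for later processing.
 */
static void
requeue_deferred(struct batchqueue *bq, struct page *m)
{
	atomic_fetch_or(&m->aflags, PGA_REQUEUE);
	if (bq->bq_cnt < BQ_SIZE)
		bq->bq_pa[bq->bq_cnt++] = m;
}

int
main(void)
{
	struct batchqueue bq = { .bq_cnt = 0 };
	struct page m = { 0 };

	requeue_deferred(&bq, &m);
	printf("pending ops %#x, batched pages %d\n",
	    atomic_load(&m.aflags), bq.bq_cnt);
	return (0);
}

Because an entry carries no payload beyond the pointer, a page's pending
entries may be processed long after they were queued, which is why the
type-stability of the page structures matters.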
@@ -578,6 +584,7 @@
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
 void vm_page_sunbusy(vm_page_t m);
+void vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq);
 int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
 void vm_page_unswappable(vm_page_t m);
@@ -667,7 +674,29 @@
  * destinations.  In order that we can easily use a 32-bit operation, we
  * require that the aflags field be 32-bit aligned.
  */
-CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
+_Static_assert(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0,
+    "aflags field is not 32-bit aligned");
+
+/*
+ * We want to be able to update the aflags and queue fields atomically in
+ * the same operation.
+ */
+_Static_assert(offsetof(struct vm_page, aflags) / sizeof(uint32_t) ==
+    offsetof(struct vm_page, queue) / sizeof(uint32_t),
+    "aflags and queue fields do not belong to the same 32-bit word");
+_Static_assert(offsetof(struct vm_page, queue) % sizeof(uint32_t) == 2,
+    "queue field is at an unexpected offset");
+_Static_assert(sizeof(((struct vm_page *)NULL)->queue) == 1,
+    "queue field has an unexpected size");
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define	VM_PAGE_AFLAG_SHIFT	0
+#define	VM_PAGE_QUEUE_SHIFT	16
+#else
+#define	VM_PAGE_AFLAG_SHIFT	24
+#define	VM_PAGE_QUEUE_SHIFT	8
+#endif
+#define	VM_PAGE_QUEUE_MASK	(0xff << VM_PAGE_QUEUE_SHIFT)

 /*
  * Clear the given bits in the specified page.
@@ -689,12 +718,7 @@
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_clear: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_clear_32(addr, val);
 }

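The endian-dependent shifts replace the old BYTE_ORDER == BIG_ENDIAN special
case in vm_page_aflag_clear() and vm_page_aflag_set().  The arithmetic can be
sanity-checked outside the kernel; the struct below is a hypothetical stand-in
that only reproduces the byte offsets the static assertions above pin down
(aflags at byte 0 of the 32-bit word, queue at byte 2).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fakepage {
	uint8_t aflags;	/* byte offset 0 within the word */
	uint8_t pad;
	uint8_t queue;	/* byte offset 2, as asserted in the patch */
	uint8_t pad2;
};

int
main(void)
{
	struct fakepage p;
	uint32_t word;
	int aflag_shift, queue_shift, little;
	union { uint32_t u; uint8_t b[4]; } probe = { .u = 1 };

	little = probe.b[0] == 1;

	/* The shifts the patch selects for each byte order. */
	aflag_shift = little ? 0 : 24;
	queue_shift = little ? 16 : 8;

	/* A store through the 32-bit word must land on the right bytes. */
	memset(&p, 0, sizeof(p));
	word = (0xabu << aflag_shift) | (0x7u << queue_shift);
	memcpy(&p, &word, sizeof(word));
	assert(p.aflags == 0xab && p.queue == 0x7);
	printf("shifts check out for this byte order\n");
	return (0);
}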
@@ -724,14 +748,44 @@
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_set: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_set_32(addr, val);
-}
+}
+
+/*
+ * Atomically update the queue state of the page.  The operation fails if
+ * any of the queue flags in "fflags" are set or if the "queue" field of
+ * the page does not match the expected value; if the operation is
+ * successful, the flags in "nflags" are set and all other queue state
+ * flags are cleared.
+ */
+static inline bool
+vm_page_pqstate_cmpset(vm_page_t m, uint32_t oldq, uint32_t newq,
+    uint32_t fflags, uint32_t nflags)
+{
+	uint32_t *addr, nval, oval, qsmask;
+
+	vm_page_assert_locked(m);
+
+	fflags <<= VM_PAGE_AFLAG_SHIFT;
+	nflags <<= VM_PAGE_AFLAG_SHIFT;
+	newq <<= VM_PAGE_QUEUE_SHIFT;
+	oldq <<= VM_PAGE_QUEUE_SHIFT;
+	qsmask = ((PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) <<
+	    VM_PAGE_AFLAG_SHIFT) | VM_PAGE_QUEUE_MASK;
+
+	addr = (void *)&m->aflags;
+	oval = atomic_load_32(addr);
+	do {
+		if ((oval & fflags) != 0)
+			return (false);
+		if ((oval & VM_PAGE_QUEUE_MASK) != oldq)
+			return (false);
+		nval = (oval & ~qsmask) | nflags | newq;
+	} while (!atomic_fcmpset_32(addr, &oval, nval));
+
+	return (true);
+}

 /*
  * vm_page_dirty:
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -3134,6 +3134,13 @@
 	vm_batchqueue_init(bq);
 }

+/*
+ * vm_page_pqbatch_submit:		[ internal use only ]
+ *
+ *	Enqueue a page in the specified page queue's batched work queue.
+ *	The caller must have encoded the requested operation in the page
+ *	structure's aflags field.
+ */
 void
 vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
 {
@@ -3255,17 +3262,26 @@
 	if ((queue = vm_page_queue(m)) == PQ_NONE)
 		return;
-	vm_page_aflag_set(m, PGA_DEQUEUE);
-	vm_page_pqbatch_submit(m, queue);
+
+	/*
+	 * Set PGA_DEQUEUE if it is not already set to handle a concurrent call
+	 * to vm_page_dequeue_deferred_free().  In particular, avoid modifying
+	 * the page's queue state once vm_page_dequeue_deferred_free() has been
+	 * called.  In the event of a race, two batch queue entries for the page
+	 * will be created, but the second will have no effect.
+	 */
+	if (vm_page_pqstate_cmpset(m, queue, queue, PGA_DEQUEUE, PGA_DEQUEUE))
+		vm_page_pqbatch_submit(m, queue);
 }

 /*
  * A variant of vm_page_dequeue_deferred() that does not assert the page
- * lock and is only to be called from vm_page_free_prep().  It is just an
- * open-coded implementation of vm_page_dequeue_deferred().  Because the
- * page is being freed, we can assume that nothing else is scheduling queue
- * operations on this page, so we get for free the mutual exclusion that
- * is otherwise provided by the page lock.
+ * lock and is only to be called from vm_page_free_prep().  Because the
+ * page is being freed, we can assume that nothing other than the page
+ * daemon is scheduling queue operations on this page, so we get for
+ * free the mutual exclusion that is otherwise provided by the page lock.
+ * To handle races, the page daemon must take care to atomically check
+ * for PGA_DEQUEUE when updating queue state.
  */
 static void
 vm_page_dequeue_deferred_free(vm_page_t m)
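vm_page_dequeue_deferred() now claims PGA_DEQUEUE with
vm_page_pqstate_cmpset() instead of setting it unconditionally.  A
self-contained userland model of that pattern, assuming illustrative flag
values and the little-endian queue shift, shows why the operation is
idempotent: the second attempt observes the very flag it passes as a blocker
and fails, so only one batch queue entry has any effect.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	PGA_DEQUEUE		0x10	/* illustrative values */
#define	PGA_REQUEUE		0x20
#define	PGA_REQUEUE_HEAD	0x40
#define	QUEUE_SHIFT		16
#define	QUEUE_MASK		(0xffu << QUEUE_SHIFT)

/* Mirrors vm_page_pqstate_cmpset()'s fail/retry structure. */
static bool
pqstate_cmpset(_Atomic uint32_t *state, uint32_t oldq, uint32_t newq,
    uint32_t fflags, uint32_t nflags)
{
	uint32_t nval, oval, qsmask;

	qsmask = PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD | QUEUE_MASK;
	oval = atomic_load(state);
	do {
		if ((oval & fflags) != 0)
			return (false);	/* a conflicting op is pending */
		if ((oval & QUEUE_MASK) != oldq << QUEUE_SHIFT)
			return (false);	/* queue changed under us */
		nval = (oval & ~qsmask) | nflags | (newq << QUEUE_SHIFT);
	} while (!atomic_compare_exchange_weak(state, &oval, nval));
	return (true);
}

int
main(void)
{
	_Atomic uint32_t state = 1u << QUEUE_SHIFT;	/* on queue 1 */

	/* The dequeue-deferred pattern: claim PGA_DEQUEUE exactly once. */
	printf("first attempt: %d\n",
	    pqstate_cmpset(&state, 1, 1, PGA_DEQUEUE, PGA_DEQUEUE));
	printf("second attempt: %d\n",
	    pqstate_cmpset(&state, 1, 1, PGA_DEQUEUE, PGA_DEQUEUE));
	return (0);
}

Here atomic_compare_exchange_weak() plays the role of atomic_fcmpset_32(): on
failure it reloads the current value into oval, so the loop re-evaluates its
preconditions against fresh state rather than spinning on a stale snapshot.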
@@ -3378,6 +3394,42 @@
 	vm_page_pqbatch_submit(m, atomic_load_8(&m->queue));
 }

+/*
+ * vm_page_swapqueue:		[ internal use only ]
+ *
+ *	Move the page from one queue to another, or to the tail of its
+ *	current queue, in the face of a possible concurrent call to
+ *	vm_page_dequeue_deferred_free().
+ */
+void
+vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq)
+{
+	struct vm_pagequeue *pq;
+
+	KASSERT(oldq < PQ_COUNT && newq < PQ_COUNT && oldq != newq,
+	    ("vm_page_swapqueue: invalid queues (%d, %d)", oldq, newq));
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("vm_page_swapqueue: page %p is unmanaged", m));
+	vm_page_assert_locked(m);
+
+	/*
+	 * Atomically update the queue field and set PGA_REQUEUE while
+	 * ensuring that PGA_DEQUEUE has not been set.
+	 */
+	pq = &vm_pagequeue_domain(m)->vmd_pagequeues[oldq];
+	vm_pagequeue_lock(pq);
+	if (!vm_page_pqstate_cmpset(m, oldq, newq, PGA_DEQUEUE, PGA_REQUEUE)) {
+		vm_pagequeue_unlock(pq);
+		return;
+	}
+	if ((m->aflags & PGA_ENQUEUED) != 0) {
+		vm_pagequeue_remove(pq, m);
+		vm_page_aflag_clear(m, PGA_ENQUEUED);
+	}
+	vm_pagequeue_unlock(pq);
+	vm_page_pqbatch_submit(m, newq);
+}
+
 /*
  * vm_page_free_prep:
  *
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -742,7 +742,7 @@
 		 * chance.
 		 */
 		if ((m->aflags & PGA_REQUEUE) != 0) {
-			vm_page_requeue(m);
+			vm_page_pqbatch_submit(m, queue);
 			continue;
 		}

@@ -1256,9 +1256,9 @@
 			 * place them directly in the laundry queue to reduce
 			 * queuing overhead.
 			 */
-			if (page_shortage <= 0)
-				vm_page_deactivate(m);
-			else {
+			if (page_shortage <= 0) {
+				vm_page_swapqueue(m, PQ_ACTIVE, PQ_INACTIVE);
+			} else {
 				/*
 				 * Calling vm_page_test_dirty() here would
 				 * require acquisition of the object's write
@@ -1270,11 +1270,13 @@
 				 * dirty field by the pmap.
 				 */
 				if (m->dirty == 0) {
-					vm_page_deactivate(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_INACTIVE);
 					page_shortage -=
 					    act_scan_laundry_weight;
 				} else {
-					vm_page_launder(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_LAUNDRY);
 					page_shortage--;
 				}
 			}
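The active queue scan now moves pages between queues with
vm_page_swapqueue(), whose cmpset fails if a freeing thread has already
requested a dequeue.  The sketch below models just that interaction, under
the same illustrative encoding as the sketch above: once PGA_DEQUEUE is
visible, the swap refuses to touch the page's queue state.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	PGA_DEQUEUE	0x10	/* illustrative values */
#define	PGA_REQUEUE	0x20
#define	QUEUE_SHIFT	16
#define	QUEUE_MASK	(0xffu << QUEUE_SHIFT)
#define	PQ_INACTIVE	1	/* illustrative queue indices */
#define	PQ_ACTIVE	2

/* Fail if any flag in "fflags" is set or the queue index is not "oldq". */
static bool
swap_queue(_Atomic uint32_t *state, uint32_t oldq, uint32_t newq,
    uint32_t fflags, uint32_t nflags)
{
	uint32_t nval, oval;

	oval = atomic_load(state);
	do {
		if ((oval & fflags) != 0 ||
		    (oval & QUEUE_MASK) != oldq << QUEUE_SHIFT)
			return (false);
		nval = (oval & ~(QUEUE_MASK | PGA_DEQUEUE | PGA_REQUEUE)) |
		    nflags | (newq << QUEUE_SHIFT);
	} while (!atomic_compare_exchange_weak(state, &oval, nval));
	return (true);
}

int
main(void)
{
	_Atomic uint32_t state = PQ_ACTIVE << QUEUE_SHIFT;

	/* The freeing thread wins the race and marks the page. */
	atomic_fetch_or(&state, PGA_DEQUEUE);

	/* The page daemon's swap now backs off, as in the scan above. */
	assert(!swap_queue(&state, PQ_ACTIVE, PQ_INACTIVE, PGA_DEQUEUE,
	    PGA_REQUEUE));
	printf("swap refused; state %#x unchanged\n", atomic_load(&state));
	return (0);
}

The page queue lock held across the cmpset in vm_page_swapqueue() then
guarantees that a page observed with PGA_ENQUEUED can be removed from the old
queue before the batched enqueue onto the new one is submitted.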