D14893.id40974.diff

Index: sys/amd64/include/vmparam.h
===================================================================
--- sys/amd64/include/vmparam.h
+++ sys/amd64/include/vmparam.h
@@ -227,4 +227,10 @@
#define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */
+/*
+ * Use a fairly large batch size since we expect amd64 systems to have lots of
+ * memory.
+ */
+#define VM_BATCHQUEUE_SIZE 31
+
#endif /* _MACHINE_VMPARAM_H_ */
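
Why 31: together with the int counter and the cache-line alignment of struct vm_batchqueue (defined in the vm_pagequeue.h hunk at the end of this diff), a 31-entry array packs the per-CPU batch into whole cache lines on LP64. A minimal sketch of the arithmetic, assuming amd64's 64-byte cache lines:

/*
 * Illustrative only: mirrors struct vm_batchqueue with the amd64
 * override above. 31 pointers (248 bytes) plus an int pad out to
 * 256 bytes, exactly four 64-byte cache lines; the MI default of 7
 * fills a single line (7 * 8 + 4 = 60, padded to 64).
 */
struct vm_batchqueue_sketch {
	void	*bq_pa[31];		/* VM_BATCHQUEUE_SIZE */
	int	 bq_cnt;
} __attribute__((__aligned__(64)));	/* CACHE_LINE_SIZE on amd64 */

_Static_assert(sizeof(struct vm_batchqueue_sketch) == 256,
    "batch queue should occupy whole cache lines");
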
Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -601,7 +601,6 @@
* CDEV
*/
{ "vm map (system)", &lock_class_mtx_sleep },
- { "vm pagequeue", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "cdev", &lock_class_mtx_sleep },
{ NULL, NULL },
@@ -611,11 +610,11 @@
{ "vm map (user)", &lock_class_sx },
{ "vm object", &lock_class_rw },
{ "vm page", &lock_class_mtx_sleep },
- { "vm pagequeue", &lock_class_mtx_sleep },
{ "pmap pv global", &lock_class_rw },
{ "pmap", &lock_class_mtx_sleep },
{ "pmap pv list", &lock_class_rw },
{ "vm page free queue", &lock_class_mtx_sleep },
+ { "vm pagequeue", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* kqueue/VFS interaction
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -111,6 +111,7 @@
objtype_t type; /* type of pager */
u_short flags; /* see below */
u_short pg_color; /* (c) color of first page in obj */
+ u_int iosize; /* (c) Natural I/O size in bytes. */
u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
int resident_page_count; /* number of resident pages */
struct vm_object *backing_object; /* object that I'm a shadow of */
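
The new iosize field is consumed by vm_pageout_free_pages() in the vm_pageout.c hunk below to size clustered frees. This diff does not show where the field is populated; a hypothetical example (the names below are assumptions, not part of this diff) would be the vnode pager recording the mount's preferred transfer size at object-allocation time:

/*
 * Hypothetical: vm_object_allocate() zeroes iosize ("no preference",
 * so vm_pageout_free_pages() falls back to a single page), and a
 * pager may then record its natural I/O size, e.g.:
 */
object->iosize = vp->v_mount->mnt_stat.f_iosize;
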
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -282,6 +282,7 @@
object->handle = NULL;
object->backing_object = NULL;
object->backing_object_offset = (vm_ooffset_t) 0;
+ object->iosize = 0;
#if VM_NRESERVLEVEL > 0
LIST_INIT(&object->rvq);
#endif
@@ -720,14 +721,11 @@
vm_object_terminate_pages(vm_object_t object)
{
vm_page_t p, p_next;
- struct mtx *mtx, *mtx1;
- struct vm_pagequeue *pq, *pq1;
- int dequeued;
+ struct mtx *mtx;
VM_OBJECT_ASSERT_WLOCKED(object);
mtx = NULL;
- pq = NULL;
/*
* Free any remaining pageable pages. This also removes them from the
@@ -737,60 +735,23 @@
*/
TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
vm_page_assert_unbusied(p);
- if ((object->flags & OBJ_UNMANAGED) == 0) {
+ if ((object->flags & OBJ_UNMANAGED) == 0)
/*
* vm_page_free_prep() only needs the page
* lock for managed pages.
*/
- mtx1 = vm_page_lockptr(p);
- if (mtx1 != mtx) {
- if (mtx != NULL)
- mtx_unlock(mtx);
- if (pq != NULL) {
- vm_pagequeue_cnt_add(pq, dequeued);
- vm_pagequeue_unlock(pq);
- pq = NULL;
- }
- mtx = mtx1;
- mtx_lock(mtx);
- }
- }
+ vm_page_change_lock(p, &mtx);
p->object = NULL;
if (p->wire_count != 0)
- goto unlist;
+ continue;
VM_CNT_INC(v_pfree);
p->flags &= ~PG_ZERO;
- if (p->queue != PQ_NONE) {
- KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
- "page %p is not queued", p));
- pq1 = vm_page_pagequeue(p);
- if (pq != pq1) {
- if (pq != NULL) {
- vm_pagequeue_cnt_add(pq, dequeued);
- vm_pagequeue_unlock(pq);
- }
- pq = pq1;
- vm_pagequeue_lock(pq);
- dequeued = 0;
- }
- p->queue = PQ_NONE;
- TAILQ_REMOVE(&pq->pq_pl, p, plinks.q);
- dequeued--;
- }
- if (vm_page_free_prep(p, true))
- continue;
-unlist:
- TAILQ_REMOVE(&object->memq, p, listq);
- }
- if (pq != NULL) {
- vm_pagequeue_cnt_add(pq, dequeued);
- vm_pagequeue_unlock(pq);
+
+ vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
- vm_page_free_phys_pglist(&object->memq);
-
/*
* If the object contained any pages, then reset it to an empty state.
* None of the object's fields, including "resident_page_count", were
@@ -1973,7 +1934,6 @@
{
vm_page_t p, next;
struct mtx *mtx;
- struct pglist pgl;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@@ -1982,7 +1942,6 @@
if (object->resident_page_count == 0)
return;
vm_object_pip_add(object, 1);
- TAILQ_INIT(&pgl);
again:
p = vm_page_find_least(object, start);
mtx = NULL;
@@ -2037,12 +1996,10 @@
if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
pmap_remove_all(p);
p->flags &= ~PG_ZERO;
- if (vm_page_free_prep(p, false))
- TAILQ_INSERT_TAIL(&pgl, p, listq);
+ vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
- vm_page_free_phys_pglist(&pgl);
vm_object_pip_wakeup(object);
}
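
Both rewritten loops above lean on vm_page_change_lock(), which this diff does not include. For reference, a sketch of its expected behavior: swap the held page lock for the one covering the given page, skipping the unlock/lock pair when both pages hash to the same pa_lock bucket.

/*
 * Sketch of vm_page_change_lock() (not part of this diff): *mtx is
 * the currently held page lock, or NULL. On return, *mtx is the lock
 * for m and it is held.
 */
void
vm_page_change_lock(vm_page_t m, struct mtx **mtx)
{
	struct mtx *mtx1;

	mtx1 = vm_page_lockptr(m);
	if (*mtx == mtx1)
		return;		/* same lock bucket; nothing to do */
	if (*mtx != NULL)
		mtx_unlock(*mtx);
	*mtx = mtx1;
	mtx_lock(mtx1);
}
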
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -94,7 +94,9 @@
* In general, operations on this structure's mutable fields are
* synchronized using either one of or a combination of the lock on the
* object that the page belongs to (O), the pool lock for the page (P),
- * or the lock for either the free or paging queue (Q). If a field is
+ * the per-domain lock for the free queues (F), or the page's queue
+ * lock (Q). The queue lock for a page depends on the value of its
+ * queue field and is described in detail below. If a field is
* annotated below with two of these locks, then holding either lock is
* sufficient for read access, but both locks are required for write
* access. An annotation of (C) indicates that the field is immutable.
@@ -143,6 +145,28 @@
* causing the thread to block. vm_page_sleep_if_busy() can be used to
* sleep until the page's busy state changes, after which the caller
* must re-lookup the page and re-evaluate its state.
+ *
+ * The queue field is the index of the page queue containing the
+ * page, or PQ_NONE if the page is not enqueued. The queue lock of a
+ * page is the page queue lock corresponding to the page queue index,
+ * or the page lock (P) for the page. To modify the queue field, the
+ * queue lock for the old value of the field must be held. It is
+ * invalid for a page's queue field to transition between two distinct
+ * page queue indices. That is, when updating the queue field, either
+ * the new value or the old value must be PQ_NONE.
+ *
+ * To avoid contention on page queue locks, page queue operations
+ * (enqueue, dequeue, requeue) are batched using per-CPU queues.
+ * A deferred operation is requested by inserting an entry into a
+ * batch queue; the entry is simply a pointer to the page, and the
+ * request type is encoded in the page's aflags field using the values
+ * in PGA_QUEUE_STATE_MASK. The type-stability of struct vm_page is
+ * crucial to this scheme since the processing of entries in a given
+ * batch queue may be deferred indefinitely. In particular, a page
+ * may be freed before its pending batch queue entries have been
+ * processed. The page lock (P) must be held to schedule a batched
+ * queue operation, and the page queue lock must be held in order to
+ * process batch queue entries for the page queue.
*/
#if PAGE_SIZE == 4096
@@ -174,7 +198,7 @@
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
vm_object_t object; /* which object am I in (O,P) */
vm_pindex_t pindex; /* offset into object (O,P) */
- vm_paddr_t phys_addr; /* physical address of page */
+ vm_paddr_t phys_addr; /* physical address of page (C) */
struct md_page md; /* machine dependent stuff */
u_int wire_count; /* wired down maps refs (P) */
volatile u_int busy_lock; /* busy owners lock */
@@ -182,11 +206,11 @@
uint16_t flags; /* page PG_* flags (P) */
uint8_t aflags; /* access is atomic */
uint8_t oflags; /* page VPO_* flags (O) */
- uint8_t queue; /* page queue index (P,Q) */
+ uint8_t queue; /* page queue index (Q) */
int8_t psind; /* pagesizes[] index (O) */
int8_t segind; /* vm_phys segment index (C) */
- uint8_t order; /* index of the buddy queue */
- uint8_t pool; /* vm_phys freepool index (Q) */
+ uint8_t order; /* index of the buddy queue (F) */
+ uint8_t pool; /* vm_phys freepool index (F) */
u_char act_count; /* page usage count (P) */
/* NOTE that these must support one bit per DEV_BSIZE in a page */
/* so, on normal X86 kernels, they must be at least 8 bits wide */
@@ -314,10 +338,32 @@
*
* PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has
* at least one executable mapping. It is not consumed by the MI VM layer.
+ *
+ * PGA_ENQUEUED is set and cleared when a page is inserted into or removed
+ * from a page queue, respectively. It determines whether the plinks.q field
+ * of the page is valid. To set or clear this flag, the queue lock for the
+ * page must be held: the page queue lock corresponding to the page's "queue"
+ * field if its value is not PQ_NONE, and the page lock otherwise.
+ *
+ * PGA_DEQUEUE is set when the page is scheduled to be dequeued from a page
+ * queue, and cleared when the dequeue request is processed. A page may
+ * have PGA_DEQUEUE set and PGA_ENQUEUED cleared, for instance if a dequeue
+ * is requested after the page is scheduled to be enqueued but before it is
+ * actually inserted into the page queue. The page lock must be held to set
+ * this flag, and the queue lock for the page must be held to clear it.
+ *
+ * PGA_REQUEUE is set when the page is scheduled to be requeued in its page
+ * queue. The page lock must be held to set this flag, and the queue lock
+ * for the page must be held to clear it.
*/
#define PGA_WRITEABLE 0x01 /* page may be mapped writeable */
#define PGA_REFERENCED 0x02 /* page has been referenced */
#define PGA_EXECUTABLE 0x04 /* page may be mapped executable */
+#define PGA_ENQUEUED 0x08 /* page is enqueued in a page queue */
+#define PGA_DEQUEUE 0x10 /* page is due to be dequeued */
+#define PGA_REQUEUE 0x20 /* page is due to be requeued */
+
+#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE)
/*
* Page flags. If changed at any other time than page allocation or
@@ -483,10 +529,10 @@
void vm_page_deactivate(vm_page_t);
void vm_page_deactivate_noreuse(vm_page_t);
void vm_page_dequeue(vm_page_t m);
+void vm_page_dequeue_lazy(vm_page_t m);
void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
-void vm_page_free_phys_pglist(struct pglist *tq);
-bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
+bool vm_page_free_prep(vm_page_t m);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
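
The batching described in the comment above relies on three small helpers — vm_batchqueue_init(), vm_batchqueue_insert() and vm_batchqueue_pop() — which the vm_page.c and vm_pageout.c hunks call but this diff does not define. A plausible minimal implementation over struct vm_batchqueue from the vm_pagequeue.h hunk:

/* Assumed helpers; the diff uses them but does not show them. */
static inline void
vm_batchqueue_init(struct vm_batchqueue *bq)
{

	bq->bq_cnt = 0;
}

static inline bool
vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m)
{

	if (bq->bq_cnt < VM_BATCHQUEUE_SIZE) {
		bq->bq_pa[bq->bq_cnt++] = m;
		return (true);
	}
	return (false);	/* full: caller must flush under the queue lock */
}

static inline vm_page_t
vm_batchqueue_pop(struct vm_batchqueue *bq)
{

	if (bq->bq_cnt == 0)
		return (NULL);
	return (bq->bq_pa[--bq->bq_cnt]);
}
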
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -131,13 +131,11 @@
extern void uma_startup(void *, int);
extern int vmem_startup_count(void);
-/*
- * Associated with page of user-allocatable memory is a
- * page structure.
- */
-
struct vm_domain vm_dom[MAXMEMDOM];
+static DPCPU_DEFINE(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
+static DPCPU_DEFINE(struct vm_batchqueue, noreuseq[MAXMEMDOM]);
+
struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
@@ -176,7 +174,7 @@
static void vm_page_alloc_check(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_enqueue(uint8_t queue, vm_page_t m);
+static void vm_page_enqueue_lazy(vm_page_t m, uint8_t queue);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
vm_pindex_t pindex, vm_page_t mpred);
@@ -186,6 +184,9 @@
vm_page_t m_run, vm_paddr_t high);
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
int req);
+static int vm_page_import(void *arg, void **store, int cnt, int domain,
+ int flags);
+static void vm_page_release(void *arg, void **store, int cnt);
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
@@ -199,6 +200,33 @@
VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
}
+/*
+ * The per-domain page cache zones are initialized later, since we need to
+ * be able to allocate pages before UMA is fully initialized.
+ */
+static void
+vm_page_init_cache_zones(void *dummy __unused)
+{
+ struct vm_domain *vmd;
+ int i;
+
+ for (i = 0; i < vm_ndomains; i++) {
+ vmd = VM_DOMAIN(i);
+ /*
+ * Don't allow the page cache to take up more than .25% of
+ * memory.
+ */
+ if (vmd->vmd_page_count / 400 < 256 * mp_ncpus)
+ continue;
+ vmd->vmd_pgcache = uma_zcache_create("vm pgcache",
+ sizeof(struct vm_page), NULL, NULL, NULL, NULL,
+ vm_page_import, vm_page_release, vmd,
+ /* UMA_ZONE_NOBUCKETCACHE |*/
+ UMA_ZONE_MAXBUCKET | UMA_ZONE_VM);
+ }
+}
+SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
+
/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
@@ -1786,6 +1814,7 @@
#if VM_NRESERVLEVEL > 0
found:
#endif
+ vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@@ -1982,8 +2011,10 @@
#if VM_NRESERVLEVEL > 0
found:
#endif
- for (m = m_ret; m < &m_ret[npages]; m++)
+ for (m = m_ret; m < &m_ret[npages]; m++) {
+ vm_page_dequeue(m);
vm_page_alloc_check(m);
+ }
/*
* Initialize the pages. Only the PG_ZERO flag is inherited.
@@ -2127,6 +2158,7 @@
goto again;
return (NULL);
}
+ vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@@ -2150,6 +2182,51 @@
return (m);
}
+static int
+vm_page_import(void *arg, void **store, int cnt, int domain, int flags)
+{
+ struct vm_domain *vmd;
+ vm_page_t m;
+ int i, j, n;
+
+ vmd = arg;
+ domain = vmd->vmd_domain;
+ n = 64; /* Starting stride. */
+ vm_domain_free_lock(vmd);
+ for (i = 0; i < cnt; i += n) {
+ n = vm_phys_alloc_npages(domain, VM_FREELIST_DEFAULT, &m,
+ MIN(n, cnt - i));
+ if (n == 0)
+ break;
+ if (!vm_domain_allocate(vmd, VM_ALLOC_NORMAL, n)) {
+ vm_phys_free_contig(m, n);
+ break;
+ }
+ for (j = 0; j < n; j++)
+ store[i + j] = m++;
+ }
+ vm_domain_free_unlock(vmd);
+
+ return (i);
+}
+
+static void
+vm_page_release(void *arg, void **store, int cnt)
+{
+ struct vm_domain *vmd;
+ vm_page_t m;
+ int i;
+
+ vmd = arg;
+ vm_domain_free_lock(vmd);
+ for (i = 0; i < cnt; i++) {
+ m = (vm_page_t)store[i];
+ vm_phys_free_pages(m, 0);
+ }
+ vm_domain_free_unlock(vmd);
+ vm_domain_freecnt_inc(vmd, cnt);
+}
+
#define VPSC_ANY 0 /* No restrictions. */
#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */
#define VPSC_NOSUPER 2 /* Skip superpages. */
@@ -2274,7 +2351,8 @@
vm_reserv_size(level)) - pa);
#endif
} else if (object->memattr == VM_MEMATTR_DEFAULT &&
- m->queue != PQ_NONE && !vm_page_busied(m)) {
+ m->queue != PQ_NONE &&
+ (m->aflags & PGA_DEQUEUE) == 0 && !vm_page_busied(m)) {
/*
* The page is allocated but eligible for
* relocation. Extend the current run by one
@@ -2425,7 +2503,9 @@
error = EINVAL;
else if (object->memattr != VM_MEMATTR_DEFAULT)
error = EINVAL;
- else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
+ else if (m->queue != PQ_NONE &&
+ (m->aflags & PGA_DEQUEUE) == 0 &&
+ !vm_page_busied(m)) {
KASSERT(pmap_page_get_memattr(m) ==
VM_MEMATTR_DEFAULT,
("page %p has an unexpected memattr", m));
@@ -2485,7 +2565,8 @@
*/
if (object->ref_count != 0)
pmap_remove_all(m);
- m_new->aflags = m->aflags;
+ m_new->aflags = m->aflags &
+ ~PGA_QUEUE_STATE_MASK;
KASSERT(m_new->oflags == VPO_UNMANAGED,
("page %p is managed", m_new));
m_new->oflags = m->oflags & VPO_NOSYNC;
@@ -2497,7 +2578,7 @@
vm_page_remque(m);
vm_page_replace_checked(m_new, object,
m->pindex, m);
- if (vm_page_free_prep(m, false))
+ if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
@@ -2511,7 +2592,7 @@
m->flags &= ~PG_ZERO;
vm_page_remque(m);
vm_page_remove(m);
- if (vm_page_free_prep(m, false))
+ if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
KASSERT(m->dirty == 0,
@@ -2954,113 +3035,289 @@
return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]);
}
+static struct mtx *
+vm_page_pagequeue_lockptr(vm_page_t m)
+{
+
+ if (m->queue == PQ_NONE)
+ return (NULL);
+ return (&vm_page_pagequeue(m)->pq_mutex);
+}
+
+static void
+vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
+ uint8_t queue)
+{
+ vm_page_t m;
+ int delta, i;
+ uint8_t aflags;
+
+ vm_pagequeue_assert_locked(pq);
+
+ delta = 0;
+ for (i = 0; i < bq->bq_cnt; i++) {
+ m = bq->bq_pa[i];
+ if (__predict_false(m->queue != queue))
+ continue;
+
+ aflags = m->aflags;
+ if ((aflags & PGA_DEQUEUE) != 0) {
+ if (__predict_true((aflags & PGA_ENQUEUED) != 0)) {
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ delta--;
+ }
+
+ /*
+ * Synchronize with the page daemon, which may be
+ * simultaneously scanning this page with only the page
+ * lock held. We must be careful to avoid leaving the
+ * page in a state where it appears to belong to a page
+ * queue.
+ */
+ m->queue = PQ_NONE;
+ atomic_thread_fence_rel();
+ vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
+ } else if ((aflags & PGA_ENQUEUED) == 0) {
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ delta++;
+ vm_page_aflag_set(m, PGA_ENQUEUED);
+ if ((aflags & PGA_REQUEUE) != 0)
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ } else if ((aflags & PGA_REQUEUE) != 0) {
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ }
+ }
+ vm_batchqueue_init(bq);
+ vm_pagequeue_cnt_add(pq, delta);
+}
+
/*
- * vm_page_dequeue:
+ * vm_page_dequeue_lazy: [ internal use only ]
*
- * Remove the given page from its current page queue.
+ * Request removal of the given page from its current page
+ * queue. Physical removal from the queue may be deferred
+ * arbitrarily, and may be cancelled by later queue operations
+ * on that page.
*
* The page must be locked.
*/
void
-vm_page_dequeue(vm_page_t m)
+vm_page_dequeue_lazy(vm_page_t m)
{
+ struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
+ int domain, queue;
vm_page_assert_locked(m);
- KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
- m));
- pq = vm_page_pagequeue(m);
- vm_pagequeue_lock(pq);
- m->queue = PQ_NONE;
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_dec(pq);
+
+ queue = m->queue;
+ if (queue == PQ_NONE)
+ return;
+ domain = vm_phys_domain(m);
+ pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
+
+ vm_page_aflag_set(m, PGA_DEQUEUE);
+
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ if (vm_batchqueue_insert(bq, m)) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ }
+ vm_pqbatch_process(pq, bq, queue);
+
+ /*
+ * The page may have been dequeued by another thread before we
+ * acquired the page queue lock. However, since we hold the
+ * page lock, the page's queue field cannot change a second
+ * time and we can safely clear PGA_DEQUEUE.
+ */
+ KASSERT(m->queue == queue || m->queue == PQ_NONE,
+ ("%s: page %p migrated between queues", __func__, m));
+ if (m->queue == queue) {
+ (void)vm_batchqueue_insert(bq, m);
+ vm_pqbatch_process(pq, bq, queue);
+ } else
+ vm_page_aflag_clear(m, PGA_DEQUEUE);
vm_pagequeue_unlock(pq);
+ critical_exit();
}
/*
* vm_page_dequeue_locked:
*
- * Remove the given page from its current page queue.
+ * Remove the page from its page queue, which must be locked.
+ * If the page lock is not held, there is no guarantee that the
+ * page will not be enqueued by another thread before this function
+ * returns. In this case, it is up to the caller to ensure that
+ * no other threads hold a reference to the page.
*
- * The page and page queue must be locked.
+ * The page queue lock must be held. If the page is not already
+ * logically dequeued, the page lock must be held as well.
*/
void
vm_page_dequeue_locked(vm_page_t m)
{
struct vm_pagequeue *pq;
- vm_page_lock_assert(m, MA_OWNED);
- pq = vm_page_pagequeue(m);
- vm_pagequeue_assert_locked(pq);
+ KASSERT(m->queue != PQ_NONE,
+ ("%s: page %p queue field is PQ_NONE", __func__, m));
+ vm_pagequeue_assert_locked(vm_page_pagequeue(m));
+ KASSERT((m->aflags & PGA_DEQUEUE) != 0 ||
+ mtx_owned(vm_page_lockptr(m)),
+ ("%s: queued unlocked page %p", __func__, m));
+
+ if ((m->aflags & PGA_ENQUEUED) != 0) {
+ pq = vm_page_pagequeue(m);
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_dec(pq);
+ }
+
+ /*
+ * Synchronize with the page daemon, which may be simultaneously
+ * scanning this page with only the page lock held. We must be careful
+ * to avoid leaving the page in a state where it appears to belong to a
+ * page queue.
+ */
m->queue = PQ_NONE;
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_dec(pq);
+ atomic_thread_fence_rel();
+ vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
+}
+
+/*
+ * vm_page_dequeue:
+ *
+ * Remove the page from whichever page queue it's in, if any.
+ * If the page lock is not held, there is no guarantee that the
+ * page will not be enqueued by another thread before this function
+ * returns. In this case, it is up to the caller to ensure that
+ * no other threads hold a reference to the page.
+ */
+void
+vm_page_dequeue(vm_page_t m)
+{
+ struct mtx *lock, *lock1;
+
+ lock = vm_page_pagequeue_lockptr(m);
+ for (;;) {
+ if (lock == NULL)
+ return;
+ mtx_lock(lock);
+ if ((lock1 = vm_page_pagequeue_lockptr(m)) == lock)
+ break;
+ mtx_unlock(lock);
+ lock = lock1;
+ }
+ KASSERT(lock == vm_page_pagequeue_lockptr(m),
+ ("%s: page %p migrated directly between queues", __func__, m));
+ vm_page_dequeue_locked(m);
+ mtx_unlock(lock);
}
/*
- * vm_page_enqueue:
+ * vm_page_enqueue_lazy:
*
- * Add the given page to the specified page queue.
+ * Schedule the given page for insertion into the specified page queue.
+ * Physical insertion of the page may be deferred indefinitely.
*
* The page must be locked.
*/
static void
-vm_page_enqueue(uint8_t queue, vm_page_t m)
+vm_page_enqueue_lazy(vm_page_t m, uint8_t queue)
{
+ struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
+ int domain;
- vm_page_lock_assert(m, MA_OWNED);
- KASSERT(queue < PQ_COUNT,
- ("vm_page_enqueue: invalid queue %u request for page %p",
- queue, m));
+ vm_page_assert_locked(m);
+ KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0,
+ ("%s: page %p is already enqueued", __func__, m));
+
+ domain = vm_phys_domain(m);
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
- vm_pagequeue_lock(pq);
+
+ /*
+ * The queue field might be changed back to PQ_NONE by a concurrent
+ * call to vm_page_dequeue(). In that case the batch queue entry will
+ * be a no-op.
+ */
m->queue = queue;
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_inc(pq);
+
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ if (__predict_true(vm_batchqueue_insert(bq, m))) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ }
+ vm_pqbatch_process(pq, bq, queue);
+ (void)vm_batchqueue_insert(bq, m);
+ vm_pqbatch_process(pq, bq, queue);
vm_pagequeue_unlock(pq);
+ critical_exit();
}
/*
* vm_page_requeue:
*
- * Move the given page to the tail of its current page queue.
+ * Schedule a requeue of the given page.
*
* The page must be locked.
*/
void
vm_page_requeue(vm_page_t m)
{
+ struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
+ int domain, queue;
vm_page_lock_assert(m, MA_OWNED);
KASSERT(m->queue != PQ_NONE,
- ("vm_page_requeue: page %p is not queued", m));
+ ("%s: page %p is not enqueued", __func__, m));
+
+ domain = vm_phys_domain(m);
+ queue = m->queue;
pq = vm_page_pagequeue(m);
- vm_pagequeue_lock(pq);
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_unlock(pq);
-}
-/*
- * vm_page_requeue_locked:
- *
- * Move the given page to the tail of its current page queue.
- *
- * The page queue must be locked.
- */
-void
-vm_page_requeue_locked(vm_page_t m)
-{
- struct vm_pagequeue *pq;
+ if (queue == PQ_NONE)
+ return;
- KASSERT(m->queue != PQ_NONE,
- ("vm_page_requeue_locked: page %p is not queued", m));
- pq = vm_page_pagequeue(m);
- vm_pagequeue_assert_locked(pq);
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_page_aflag_set(m, PGA_REQUEUE);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ if (__predict_true(vm_batchqueue_insert(bq, m))) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ }
+ vm_pqbatch_process(pq, bq, queue);
+ KASSERT(m->queue == queue || m->queue == PQ_NONE,
+ ("%s: page %p migrated between queues", __func__, m));
+ if (m->queue == queue) {
+ (void)vm_batchqueue_insert(bq, m);
+ vm_pqbatch_process(pq, bq, queue);
+ } else
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ vm_pagequeue_unlock(pq);
+ critical_exit();
}
/*
@@ -3078,18 +3335,18 @@
int queue;
vm_page_lock_assert(m, MA_OWNED);
- if ((queue = m->queue) != PQ_ACTIVE) {
- if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- if (m->act_count < ACT_INIT)
- m->act_count = ACT_INIT;
- if (queue != PQ_NONE)
- vm_page_dequeue(m);
- vm_page_enqueue(PQ_ACTIVE, m);
- }
- } else {
- if (m->act_count < ACT_INIT)
+
+ if ((queue = m->queue) == PQ_ACTIVE || m->wire_count > 0 ||
+ (m->oflags & VPO_UNMANAGED) != 0) {
+ if (queue == PQ_ACTIVE && m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
+ return;
}
+
+ vm_page_remque(m);
+ if (m->act_count < ACT_INIT)
+ m->act_count = ACT_INIT;
+ vm_page_enqueue_lazy(m, PQ_ACTIVE);
}
/*
@@ -3100,11 +3357,10 @@
* the page to the free list only if this function returns true.
*
* The object must be locked. The page must be locked if it is
- * managed. For a queued managed page, the pagequeue_locked
- * argument specifies whether the page queue is already locked.
+ * managed.
*/
bool
-vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
+vm_page_free_prep(vm_page_t m)
{
#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
@@ -3120,14 +3376,14 @@
if ((m->oflags & VPO_UNMANAGED) == 0) {
vm_page_lock_assert(m, MA_OWNED);
KASSERT(!pmap_page_is_mapped(m),
- ("vm_page_free_toq: freeing mapped page %p", m));
+ ("vm_page_free_prep: freeing mapped page %p", m));
} else
KASSERT(m->queue == PQ_NONE,
- ("vm_page_free_toq: unmanaged page %p is queued", m));
+ ("vm_page_free_prep: unmanaged page %p is queued", m));
VM_CNT_INC(v_tfree);
if (vm_page_sbusied(m))
- panic("vm_page_free: freeing busy page %p", m);
+ panic("vm_page_free_prep: freeing busy page %p", m);
vm_page_remove(m);
@@ -3143,21 +3399,23 @@
return (false);
}
- if (m->queue != PQ_NONE) {
- if (pagequeue_locked)
- vm_page_dequeue_locked(m);
- else
- vm_page_dequeue(m);
- }
+ /*
+ * Pages need not be dequeued before they are returned to the physical
+ * memory allocator, but they must at least be marked for a deferred
+ * dequeue.
+ */
+ if ((m->oflags & VPO_UNMANAGED) == 0)
+ vm_page_dequeue_lazy(m);
+
m->valid = 0;
vm_page_undirty(m);
if (m->wire_count != 0)
- panic("vm_page_free: freeing wired page %p", m);
+ panic("vm_page_free_prep: freeing wired page %p", m);
if (m->hold_count != 0) {
m->flags &= ~PG_ZERO;
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
- ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
+ ("vm_page_free_prep: freeing PG_UNHOLDFREE page %p", m));
m->flags |= PG_UNHOLDFREE;
return (false);
}
@@ -3176,36 +3434,6 @@
return (true);
}
-void
-vm_page_free_phys_pglist(struct pglist *tq)
-{
- struct vm_domain *vmd;
- vm_page_t m;
- int cnt;
-
- if (TAILQ_EMPTY(tq))
- return;
- vmd = NULL;
- cnt = 0;
- TAILQ_FOREACH(m, tq, listq) {
- if (vmd != vm_pagequeue_domain(m)) {
- if (vmd != NULL) {
- vm_domain_free_unlock(vmd);
- vm_domain_freecnt_inc(vmd, cnt);
- cnt = 0;
- }
- vmd = vm_pagequeue_domain(m);
- vm_domain_free_lock(vmd);
- }
- vm_phys_free_pages(m, 0);
- cnt++;
- }
- if (vmd != NULL) {
- vm_domain_free_unlock(vmd);
- vm_domain_freecnt_inc(vmd, cnt);
- }
-}
-
/*
* vm_page_free_toq:
*
@@ -3220,8 +3448,9 @@
{
struct vm_domain *vmd;
- if (!vm_page_free_prep(m, false))
+ if (!vm_page_free_prep(m))
return;
+
vmd = vm_pagequeue_domain(m);
vm_domain_free_lock(vmd);
vm_phys_free_pages(m, 0);
@@ -3243,23 +3472,18 @@
vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
{
vm_page_t m;
- struct pglist pgl;
int count;
if (SLIST_EMPTY(free))
return;
count = 0;
- TAILQ_INIT(&pgl);
while ((m = SLIST_FIRST(free)) != NULL) {
count++;
SLIST_REMOVE_HEAD(free, plinks.s.ss);
- if (vm_page_free_prep(m, false))
- TAILQ_INSERT_TAIL(&pgl, m, listq);
+ vm_page_free_toq(m);
}
- vm_page_free_phys_pglist(&pgl);
-
if (update_wire_count)
vm_wire_sub(count);
}
@@ -3318,22 +3542,25 @@
KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
("vm_page_unwire: invalid queue %u request for page %p",
queue, m));
+ if ((m->oflags & VPO_UNMANAGED) == 0)
+ vm_page_assert_locked(m);
unwired = vm_page_unwire_noq(m);
- if (unwired && (m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL) {
- if (m->queue == queue) {
+ if (!unwired || (m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL)
+ return (unwired);
+
+ if (m->queue == queue) {
+ if (queue == PQ_ACTIVE)
+ vm_page_reference(m);
+ else if (queue != PQ_NONE)
+ vm_page_requeue(m);
+ } else {
+ vm_page_dequeue(m);
+ if (queue != PQ_NONE) {
+ vm_page_enqueue_lazy(m, queue);
if (queue == PQ_ACTIVE)
- vm_page_reference(m);
- else if (queue != PQ_NONE)
- vm_page_requeue(m);
- } else {
- vm_page_remque(m);
- if (queue != PQ_NONE) {
- vm_page_enqueue(queue, m);
- if (queue == PQ_ACTIVE)
- /* Initialize act_count. */
- vm_page_activate(m);
- }
+ /* Initialize act_count. */
+ vm_page_activate(m);
}
}
return (unwired);
@@ -3369,73 +3596,85 @@
}
/*
- * Move the specified page to the inactive queue, or requeue the page if it is
- * already in the inactive queue.
- *
- * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
- * queue. However, setting "noreuse" to TRUE will accelerate the specified
- * page's reclamation, but it will not unmap the page from any address space.
- * This is implemented by inserting the page near the head of the inactive
- * queue, using a marker page to guide FIFO insertion ordering.
+ * Move the specified page to the tail of the inactive queue, or requeue
+ * the page if it is already in the inactive queue.
*
* The page must be locked.
*/
-static inline void
-_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
+void
+vm_page_deactivate(vm_page_t m)
{
- struct vm_pagequeue *pq;
- int queue;
vm_page_assert_locked(m);
- if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE];
- /* Avoid multiple acquisitions of the inactive queue lock. */
- queue = m->queue;
- if (queue == PQ_INACTIVE) {
- vm_pagequeue_lock(pq);
- vm_page_dequeue_locked(m);
- } else {
- if (queue != PQ_NONE)
- vm_page_dequeue(m);
- vm_pagequeue_lock(pq);
- }
- m->queue = PQ_INACTIVE;
- if (noreuse)
- TAILQ_INSERT_BEFORE(
- &vm_pagequeue_domain(m)->vmd_inacthead, m,
- plinks.q);
- else
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_inc(pq);
- vm_pagequeue_unlock(pq);
- }
-}
-
-/*
- * Move the specified page to the inactive queue, or requeue the page if it is
- * already in the inactive queue.
- *
- * The page must be locked.
- */
-void
-vm_page_deactivate(vm_page_t m)
-{
+ if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
+ return;
- _vm_page_deactivate(m, FALSE);
+ if (!vm_page_inactive(m)) {
+ vm_page_remque(m);
+ vm_page_enqueue_lazy(m, PQ_INACTIVE);
+ } else
+ vm_page_requeue(m);
}
/*
- * Move the specified page to the inactive queue with the expectation
- * that it is unlikely to be reused.
+ * Move the specified page close to the head of the inactive queue,
+ * bypassing LRU. A marker page is used to maintain FIFO ordering.
+ * As with regular enqueues, we use a per-CPU batch queue to reduce
+ * contention on the page queue lock.
*
* The page must be locked.
*/
void
vm_page_deactivate_noreuse(vm_page_t m)
{
+ struct vm_batchqueue *bq;
+ struct vm_domain *vmd;
+ struct vm_pagequeue *pq;
+ vm_page_t marker;
+ int domain;
+
+ vm_page_assert_locked(m);
+
+ if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
+ return;
+
+ domain = vm_phys_domain(m);
+ vmd = VM_DOMAIN(domain);
+ pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
+
+ if (!vm_page_inactive(m))
+ vm_page_remque(m);
- _vm_page_deactivate(m, TRUE);
+ m->queue = PQ_INACTIVE;
+
+ critical_enter();
+ bq = DPCPU_PTR(noreuseq[domain]);
+ if (vm_batchqueue_insert(bq, m)) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(noreuseq[domain]);
+ }
+ marker = &vmd->vmd_inacthead;
+ do {
+ if (m->queue != PQ_INACTIVE)
+ continue;
+ if ((m->aflags & PGA_ENQUEUED) != 0)
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ else {
+ vm_page_aflag_set(m, PGA_ENQUEUED);
+ vm_pagequeue_cnt_inc(pq);
+ }
+ TAILQ_INSERT_BEFORE(marker, m, plinks.q);
+ } while ((m = vm_batchqueue_pop(bq)) != NULL);
+ vm_pagequeue_unlock(pq);
+ vm_batchqueue_init(bq);
+ critical_exit();
}
/*
@@ -3448,13 +3687,14 @@
{
vm_page_assert_locked(m);
- if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- if (m->queue == PQ_LAUNDRY)
- vm_page_requeue(m);
- else {
- vm_page_remque(m);
- vm_page_enqueue(PQ_LAUNDRY, m);
- }
+ if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
+ return;
+
+ if (m->queue == PQ_LAUNDRY)
+ vm_page_requeue(m);
+ else {
+ vm_page_remque(m);
+ vm_page_enqueue_lazy(m, PQ_LAUNDRY);
}
}
@@ -3470,9 +3710,9 @@
vm_page_assert_locked(m);
KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0,
("page %p already unswappable", m));
- if (m->queue != PQ_NONE)
- vm_page_dequeue(m);
- vm_page_enqueue(PQ_UNSWAPPABLE, m);
+
+ vm_page_remque(m);
+ vm_page_enqueue_lazy(m, PQ_UNSWAPPABLE);
}
/*
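
vm_page_dequeue_lazy(), vm_page_enqueue_lazy() and vm_page_requeue() above all repeat the same stage-then-flush pattern: publish the request in the per-CPU batch inside a critical section, and take the page queue lock only when the batch overflows. A hypothetical helper (not part of this diff) factoring out the common steps:

/*
 * Hypothetical refactoring: stage a queue operation for page m (the
 * request type is already encoded in m->aflags) and flush the per-CPU
 * batch when it is full. Unlike vm_page_dequeue_lazy(), this sketch
 * unconditionally reinserts m after the flush; the real callers
 * recheck m->queue first.
 */
static void
vm_pqbatch_submit(vm_page_t m, uint8_t queue)
{
	struct vm_batchqueue *bq;
	struct vm_pagequeue *pq;
	int domain;

	domain = vm_phys_domain(m);
	pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];

	critical_enter();
	bq = DPCPU_PTR(pqbatch[domain][queue]);
	if (vm_batchqueue_insert(bq, m)) {
		critical_exit();
		return;
	}

	/*
	 * The batch is full. Drop the critical section before sleeping
	 * on a contested queue lock; we may migrate CPUs, so reload the
	 * per-CPU pointer afterwards.
	 */
	if (!vm_pagequeue_trylock(pq)) {
		critical_exit();
		vm_pagequeue_lock(pq);
		critical_enter();
		bq = DPCPU_PTR(pqbatch[domain][queue]);
	}
	vm_pqbatch_process(pq, bq, queue);
	(void)vm_batchqueue_insert(bq, m);
	vm_pqbatch_process(pq, bq, queue);
	vm_pagequeue_unlock(pq);
	critical_exit();
}
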
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -201,11 +201,17 @@
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
static u_int isqrt(u_int num);
-static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static int vm_pageout_launder(struct vm_domain *vmd, int launder,
bool in_shortfall);
static void vm_pageout_laundry_worker(void *arg);
-static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
+
+struct scan_state {
+ struct vm_batchqueue bq;
+ struct vm_pagequeue *pq;
+ vm_page_t marker;
+ int maxscan;
+ int scanned;
+};
/*
* Initialize a dummy page for marking the caller's place in the specified
@@ -225,96 +231,157 @@
}
/*
- * vm_pageout_fallback_object_lock:
- *
- * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
- * known to have failed and page queue must be either PQ_ACTIVE or
- * PQ_INACTIVE. To avoid lock order violation, unlock the page queue
- * while locking the vm object. Use marker page to detect page queue
- * changes and maintain notion of next page on page queue. Return
- * TRUE if no changes were detected, FALSE otherwise. vm object is
- * locked on return.
- *
- * This function depends on both the lock portion of struct vm_object
- * and normal struct vm_page being type stable.
+ * Initialize and enqueue static queue markers.
*/
-static boolean_t
-vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
+static void
+vm_pageout_insert_markers(struct vm_domain *vmd)
{
- struct vm_page marker;
- struct vm_pagequeue *pq;
- boolean_t unchanged;
- u_short queue;
- vm_object_t object;
+ vm_page_t marker;
+ int i;
- queue = m->queue;
- vm_pageout_init_marker(&marker, queue);
- pq = vm_page_pagequeue(m);
- object = m->object;
-
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
- vm_pagequeue_unlock(pq);
- vm_page_unlock(m);
- VM_OBJECT_WLOCK(object);
- vm_page_lock(m);
- vm_pagequeue_lock(pq);
+ /*
+ * inacthead is used to provide FIFO ordering for LRU-bypassing
+ * insertions.
+ */
+ marker = &vmd->vmd_inacthead;
+ vm_pageout_init_marker(marker, PQ_INACTIVE);
+ TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, marker,
+ plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
/*
- * The page's object might have changed, and/or the page might
- * have moved from its original position in the queue. If the
- * page's object has changed, then the caller should abandon
- * processing the page because the wrong object lock was
- * acquired. Use the marker's plinks.q, not the page's, to
- * determine if the page has been moved. The state of the
- * page's plinks.q can be indeterminate; whereas, the marker's
- * plinks.q must be valid.
+ * The clock pages are used to implement active queue scanning without
+ * requeues. Scans start at clock[0], which is advanced after the scan
+ * ends. When the two clock hands meet, they are reset and scanning
+ * resumes from the head of the queue.
*/
- *next = TAILQ_NEXT(&marker, plinks.q);
- unchanged = m->object == object &&
- m == TAILQ_PREV(&marker, pglist, plinks.q);
- KASSERT(!unchanged || m->queue == queue,
- ("page %p queue %d %d", m, queue, m->queue));
- TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
- return (unchanged);
+ marker = &vmd->vmd_clock[0];
+ vm_pageout_init_marker(marker, PQ_ACTIVE);
+ TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, marker,
+ plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+ marker = &vmd->vmd_clock[1];
+ vm_pageout_init_marker(marker, PQ_ACTIVE);
+ TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, marker,
+ plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+
+ for (i = 0; i < PQ_COUNT; i++)
+ vm_pageout_init_marker(&vmd->vmd_markers[i], i);
+}
+
+static void
+vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
+ vm_page_t marker, int maxscan)
+{
+
+ vm_pagequeue_assert_locked(pq);
+
+ if ((marker->aflags & PGA_ENQUEUED) == 0) {
+ TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+ }
+ vm_batchqueue_init(&ss->bq);
+ ss->pq = pq;
+ ss->marker = marker;
+ ss->maxscan = maxscan;
+ ss->scanned = 0;
+ vm_pagequeue_unlock(pq);
+}
+
+static void
+vm_pageout_end_scan(struct scan_state *ss)
+{
+ struct vm_pagequeue *pq;
+
+ pq = ss->pq;
+ vm_pagequeue_assert_locked(pq);
+ KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
+ ("marker %p not enqueued", ss->marker));
+
+ if ((ss->marker->aflags & PGA_ENQUEUED) != 0) {
+ TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
+ vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
+ }
+ VM_CNT_ADD(v_pdpages, ss->scanned);
+}
+
+static inline bool
+vm_pageout_page_queued(vm_page_t m, int queue)
+{
+
+ vm_page_assert_locked(m);
+
+ if ((m->aflags & PGA_DEQUEUE) != 0)
+ return (false);
+ atomic_thread_fence_acq();
+ return (m->queue == queue);
}
/*
- * Lock the page while holding the page queue lock. Use marker page
- * to detect page queue changes and maintain notion of next page on
- * page queue. Return TRUE if no changes were detected, FALSE
- * otherwise. The page is locked on return. The page queue lock might
- * be dropped and reacquired.
+ * Add a small number of queued pages to a batch queue for later processing
+ * without the corresponding queue lock held. The caller must have enqueued a
+ * marker page at the desired start point for the scan.
*
- * This function depends on normal struct vm_page being type stable.
+ * When processing the batch queue, vm_pageout_page_queued() must be used to
+ * determine whether the page was logically dequeued by another thread. Once
+ * this check is performed, the page lock guarantees that the page will not be
+ * disassociated from the queue.
*/
-static boolean_t
-vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
+static inline void
+vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
{
- struct vm_page marker;
struct vm_pagequeue *pq;
- boolean_t unchanged;
- u_short queue;
+ vm_page_t m, marker;
- vm_page_lock_assert(m, MA_NOTOWNED);
- if (vm_page_trylock(m))
- return (TRUE);
+ KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
+ ("marker %p not enqueued", ss->marker));
- queue = m->queue;
- vm_pageout_init_marker(&marker, queue);
- pq = vm_page_pagequeue(m);
+ marker = ss->marker;
+ pq = ss->pq;
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
- vm_pagequeue_unlock(pq);
- vm_page_lock(m);
vm_pagequeue_lock(pq);
+ for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
+ ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
+ m = TAILQ_NEXT(m, plinks.q), ss->scanned++) {
+ if ((m->flags & PG_MARKER) == 0) {
+ KASSERT((m->aflags & PGA_ENQUEUED) != 0,
+ ("page %p not enqueued", m));
+ KASSERT((m->flags & PG_FICTITIOUS) == 0,
+ ("Fictitious page %p cannot be in page queue", m));
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("Unmanaged page %p cannot be in page queue", m));
+ } else if (dequeue)
+ continue;
+
+ (void)vm_batchqueue_insert(&ss->bq, m);
+ if (dequeue) {
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ vm_page_aflag_clear(m, PGA_ENQUEUED);
+ }
+ }
+ TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
+ if (__predict_true(m != NULL))
+ TAILQ_INSERT_BEFORE(m, marker, plinks.q);
+ else
+ TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
+ if (dequeue)
+ vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
+ vm_pagequeue_unlock(pq);
+}
+
+/*
+ * Return the next page in the scan, refilling the batch queue from the
+ * page queue when it runs empty.
+ */
+static inline vm_page_t
+vm_pageout_next(struct scan_state *ss, const bool dequeue)
+{
+ vm_page_t m;
- /* Page queue might have changed. */
- *next = TAILQ_NEXT(&marker, plinks.q);
- unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
- KASSERT(!unchanged || m->queue == queue,
- ("page %p queue %d %d", m, queue, m->queue));
- TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
- return (unchanged);
+ m = vm_batchqueue_pop(&ss->bq);
+ if (m == NULL) {
+ vm_pageout_collect_batch(ss, dequeue);
+ m = vm_batchqueue_pop(&ss->bq);
+ }
+ return (m);
}
/*
@@ -370,12 +437,12 @@
break;
}
vm_page_test_dirty(p);
- if (p->dirty == 0) {
+ if (p->dirty == 0 || !vm_page_in_laundry(p)) {
ib = 0;
break;
}
vm_page_lock(p);
- if (!vm_page_in_laundry(p) || vm_page_held(p)) {
+ if (vm_page_held(p)) {
vm_page_unlock(p);
ib = 0;
break;
@@ -398,10 +465,10 @@
if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
break;
vm_page_test_dirty(p);
- if (p->dirty == 0)
+ if (p->dirty == 0 || !vm_page_in_laundry(p))
break;
vm_page_lock(p);
- if (!vm_page_in_laundry(p) || vm_page_held(p)) {
+ if (vm_page_held(p)) {
vm_page_unlock(p);
break;
}
@@ -692,13 +759,18 @@
static int
vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
{
+ struct scan_state ss;
struct vm_pagequeue *pq;
+ struct mtx *mtx;
vm_object_t object;
- vm_page_t m, next;
- int act_delta, error, maxscan, numpagedout, starting_target;
+ vm_page_t m, marker;
+ int act_delta, error, numpagedout, queue, starting_target;
int vnodes_skipped;
- bool pageout_ok, queue_locked;
+ bool obj_locked, pageout_ok;
+ mtx = NULL;
+ obj_locked = false;
+ object = NULL;
starting_target = launder;
vnodes_skipped = 0;
@@ -711,61 +783,78 @@
* maxscan ensures that we don't re-examine requeued pages. Any
* additional pages written as part of a cluster are subtracted from
* maxscan since they must be taken from the laundry queue.
+ * XXX
*
* As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
* swap devices are configured.
*/
if (atomic_load_acq_int(&swapdev_enabled))
- pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE];
+ queue = PQ_UNSWAPPABLE;
else
- pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+ queue = PQ_LAUNDRY;
+ marker = &vmd->vmd_markers[queue];
+ pq = &vmd->vmd_pagequeues[queue];
scan:
vm_pagequeue_lock(pq);
- maxscan = pq->pq_cnt;
- queue_locked = true;
- for (m = TAILQ_FIRST(&pq->pq_pl);
- m != NULL && maxscan-- > 0 && launder > 0;
- m = next) {
- vm_pagequeue_assert_locked(pq);
- KASSERT(queue_locked, ("unlocked laundry queue"));
- KASSERT(vm_page_in_laundry(m),
- ("page %p has an inconsistent queue", m));
- next = TAILQ_NEXT(m, plinks.q);
- if ((m->flags & PG_MARKER) != 0)
+ vm_pageout_init_scan(&ss, pq, marker, pq->pq_cnt);
+ while ((m = vm_pageout_next(&ss, false)) != NULL) {
+ if (__predict_false((m->flags & PG_MARKER) != 0))
continue;
- KASSERT((m->flags & PG_FICTITIOUS) == 0,
- ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
- if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
- vm_page_unlock(m);
+
+ vm_page_change_lock(m, &mtx);
+
+recheck:
+ /*
+ * The page may have been disassociated from the queue
+ * while locks were dropped.
+ */
+ if (!vm_pageout_page_queued(m, queue))
+ continue;
+
+ /*
+ * A requeue was requested, so this page gets a second
+ * chance.
+ */
+ if ((m->aflags & PGA_REQUEUE) != 0) {
+ vm_page_requeue(m);
continue;
}
+
+ /*
+ * Held pages are essentially stuck in the queue.
+ *
+ * Wired pages may not be freed. Complete their removal
+ * from the queue now to avoid needless revisits during
+ * future scans.
+ */
+ if (m->hold_count != 0)
+ continue;
if (m->wire_count != 0) {
- vm_page_dequeue_locked(m);
- vm_page_unlock(m);
+ vm_page_dequeue_lazy(m);
continue;
}
- object = m->object;
- if ((!VM_OBJECT_TRYWLOCK(object) &&
- (!vm_pageout_fallback_object_lock(m, &next) ||
- vm_page_held(m))) || vm_page_busied(m)) {
- VM_OBJECT_WUNLOCK(object);
- if (m->wire_count != 0 && vm_page_pagequeue(m) == pq)
- vm_page_dequeue_locked(m);
- vm_page_unlock(m);
- continue;
+
+ if (object != m->object) {
+ if (obj_locked) {
+ VM_OBJECT_WUNLOCK(object);
+ obj_locked = false;
+ }
+ object = m->object;
+ }
+ if (!obj_locked) {
+ if (!VM_OBJECT_TRYWLOCK(object)) {
+ mtx_unlock(mtx);
+ VM_OBJECT_WLOCK(object);
+ obj_locked = true;
+ mtx_lock(mtx);
+ goto recheck;
+ } else
+ obj_locked = true;
}
- /*
- * Unlock the laundry queue, invalidating the 'next' pointer.
- * Use a marker to remember our place in the laundry queue.
- */
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
- plinks.q);
- vm_pagequeue_unlock(pq);
- queue_locked = false;
+ if (vm_page_busied(m))
+ continue;
/*
* Invalid pages can be easily freed. They cannot be
@@ -814,9 +903,11 @@
*/
if (!in_shortfall)
launder--;
- goto drop_page;
- } else if ((object->flags & OBJ_DEAD) == 0)
- goto requeue_page;
+ continue;
+ } else if ((object->flags & OBJ_DEAD) == 0) {
+ vm_page_requeue(m);
+ continue;
+ }
}
/*
@@ -851,11 +942,8 @@
else
pageout_ok = true;
if (!pageout_ok) {
-requeue_page:
- vm_pagequeue_lock(pq);
- queue_locked = true;
- vm_page_requeue_locked(m);
- goto drop_page;
+ vm_page_requeue(m);
+ continue;
}
/*
@@ -874,28 +962,31 @@
error = vm_pageout_clean(m, &numpagedout);
if (error == 0) {
launder -= numpagedout;
- maxscan -= numpagedout - 1;
+ ss.scanned += numpagedout;
} else if (error == EDEADLK) {
pageout_lock_miss++;
vnodes_skipped++;
}
- goto relock_queue;
+ mtx = NULL;
+ obj_locked = false;
}
-drop_page:
- vm_page_unlock(m);
+ }
+ if (mtx != NULL) {
+ mtx_unlock(mtx);
+ mtx = NULL;
+ }
+ if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
-relock_queue:
- if (!queue_locked) {
- vm_pagequeue_lock(pq);
- queue_locked = true;
- }
- next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
- TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
+ obj_locked = false;
}
+ vm_pagequeue_lock(pq);
+ vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) {
- pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+ queue = PQ_LAUNDRY;
+ marker = &vmd->vmd_markers[queue];
+ pq = &vmd->vmd_pagequeues[queue];
goto scan;
}
@@ -951,7 +1042,6 @@
vmd = VM_DOMAIN(domain);
pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
- vm_pageout_init_marker(&vmd->vmd_laundry_marker, PQ_LAUNDRY);
shortfall = 0;
in_shortfall = false;
@@ -1091,6 +1181,68 @@
}
}
+static int
+vm_pageout_free_pages(vm_object_t object, vm_page_t m, struct mtx **mtxp)
+{
+ vm_page_t p, pp;
+ vm_pindex_t start;
+ int pcount, count;
+
+ pcount = MAX(object->iosize / PAGE_SIZE, 1);
+ count = 1;
+ if (pcount == 1) {
+ vm_page_free(m);
+ goto out;
+ }
+
+ /* Find the first page in the block. */
+ start = m->pindex - (m->pindex % pcount);
+ for (p = m; p->pindex > start && (pp = vm_page_prev(p)) != NULL;
+ p = pp);
+
+ /* Free the original page so we don't validate it twice. */
+ if (p == m)
+ p = vm_page_next(m);
+ vm_page_free(m);
+ /* Iterate through the block range and free compatible pages. */
+ for (m = p; m != NULL; m = p) {
+ /* Don't cache miss for the next page after the tail. */
+ if (m->pindex < start + pcount)
+ p = TAILQ_NEXT(m, listq);
+ else
+ p = NULL;
+ vm_page_change_lock(m, mtxp);
+ if (vm_page_held(m) || vm_page_busied(m) ||
+ m->queue != PQ_INACTIVE)
+ continue;
+ if (m->valid == 0)
+ goto free_page;
+ if ((m->aflags & PGA_REFERENCED) != 0)
+ continue;
+ if (object->ref_count != 0) {
+ if (pmap_ts_referenced(m)) {
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ continue;
+ }
+ vm_page_test_dirty(m);
+ if (m->dirty == 0)
+ pmap_remove_all(m);
+ }
+ if (m->dirty) {
+ if ((object->flags & OBJ_DEAD) == 0)
+ vm_page_launder(m);
+ continue;
+ }
+free_page:
+ vm_page_free(m);
+ count++;
+ }
+out:
+ VM_CNT_ADD(v_dfree, count);
+
+ return (count);
+}
+
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*
@@ -1103,13 +1255,16 @@
static bool
vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
{
- vm_page_t m, next;
+ struct scan_state ss;
+ struct vm_batchqueue rq;
+ struct mtx *mtx;
+ vm_page_t m, marker;
struct vm_pagequeue *pq;
vm_object_t object;
long min_scan;
- int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
- int page_shortage, scan_tick, scanned, starting_page_shortage;
- boolean_t queue_locked;
+ int act_delta, addl_page_shortage, deficit, inactq_shortage;
+ int page_shortage, scan_tick, starting_page_shortage;
+ bool obj_locked;
/*
* If we need to reclaim memory ask kernel caches to return
@@ -1150,78 +1305,82 @@
page_shortage = deficit = 0;
starting_page_shortage = page_shortage;
+ mtx = NULL;
+ obj_locked = false;
+ object = NULL;
+ vm_batchqueue_init(&rq);
+
/*
* Start scanning the inactive queue for pages that we can free. The
* scan will stop when we reach the target or we have scanned the
* entire queue. (Note that m->act_count is not used to make
* decisions for the inactive queue, only for the active queue.)
*/
+ marker = &vmd->vmd_markers[PQ_INACTIVE];
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
- maxscan = pq->pq_cnt;
vm_pagequeue_lock(pq);
- queue_locked = TRUE;
- for (m = TAILQ_FIRST(&pq->pq_pl);
- m != NULL && maxscan-- > 0 && page_shortage > 0;
- m = next) {
- vm_pagequeue_assert_locked(pq);
- KASSERT(queue_locked, ("unlocked inactive queue"));
- KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
+ vm_pageout_init_scan(&ss, pq, marker, min(pq->pq_cnt, page_shortage));
+ while ((m = vm_pageout_next(&ss, true)) != NULL) {
+ if (__predict_false((m->flags & PG_MARKER) != 0))
+ continue;
- VM_CNT_INC(v_pdpages);
- next = TAILQ_NEXT(m, plinks.q);
+ vm_page_change_lock(m, &mtx);
+recheck:
/*
- * skip marker pages
+ * The page may have been disassociated from the queue
+ * while locks were dropped.
*/
- if (m->flags & PG_MARKER)
+ if (!vm_pageout_page_queued(m, PQ_INACTIVE)) {
+ addl_page_shortage++;
continue;
+ }
- KASSERT((m->flags & PG_FICTITIOUS) == 0,
- ("Fictitious page %p cannot be in inactive queue", m));
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("Unmanaged page %p cannot be in inactive queue", m));
+ /*
+ * A requeue was requested, so this page gets a second
+ * chance.
+ */
+ if ((m->aflags & PGA_REQUEUE) != 0)
+ goto reinsert;
/*
- * The page or object lock acquisitions fail if the
- * page was removed from the queue or moved to a
- * different position within the queue. In either
- * case, addl_page_shortage should not be incremented.
+ * Held pages are essentially stuck in the queue. So,
+ * they ought to be discounted from the inactive count.
+ * See the calculation of inactq_shortage before the
+ * loop over the active queue below.
+ *
+ * Wired pages may not be freed. Complete their removal
+ * from the queue now to avoid needless revisits during
+ * future scans.
*/
- if (!vm_pageout_page_lock(m, &next))
- goto unlock_page;
- else if (m->wire_count != 0) {
- /*
- * Wired pages may not be freed, and unwiring a queued
- * page will cause it to be requeued. Thus, remove them
- * from the queue now to avoid unnecessary revisits.
- */
- vm_page_dequeue_locked(m);
+ if (m->hold_count != 0) {
addl_page_shortage++;
- goto unlock_page;
- } else if (m->hold_count != 0) {
- /*
- * Held pages are essentially stuck in the
- * queue. So, they ought to be discounted
- * from the inactive count. See the
- * calculation of inactq_shortage before the
- * loop over the active queue below.
- */
+ goto reinsert;
+ }
+ if (m->wire_count != 0) {
addl_page_shortage++;
- goto unlock_page;
+ vm_page_dequeue_lazy(m);
+ continue;
}
- object = m->object;
- if (!VM_OBJECT_TRYWLOCK(object)) {
- if (!vm_pageout_fallback_object_lock(m, &next))
- goto unlock_object;
- else if (m->wire_count != 0) {
- vm_page_dequeue_locked(m);
- addl_page_shortage++;
- goto unlock_object;
- } else if (m->hold_count != 0) {
- addl_page_shortage++;
- goto unlock_object;
+
+ if (object != m->object) {
+ if (obj_locked) {
+ VM_OBJECT_WUNLOCK(object);
+ obj_locked = false;
}
+ object = m->object;
}
+ if (!obj_locked) {
+ if (!VM_OBJECT_TRYWLOCK(object)) {
+ mtx_unlock(mtx);
+ VM_OBJECT_WLOCK(object);
+ obj_locked = true;
+ mtx_lock(mtx);
+ goto recheck;
+ } else
+ obj_locked = true;
+ }
+
if (vm_page_busied(m)) {
/*
* Don't mess with busy pages. Leave them at
@@ -1232,26 +1391,8 @@
* inactive count.
*/
addl_page_shortage++;
-unlock_object:
- VM_OBJECT_WUNLOCK(object);
-unlock_page:
- vm_page_unlock(m);
- continue;
+ goto reinsert;
}
- KASSERT(!vm_page_held(m), ("Held page %p", m));
-
- /*
- * Dequeue the inactive page and unlock the inactive page
- * queue, invalidating the 'next' pointer. Dequeueing the
- * page here avoids a later reacquisition (and release) of
- * the inactive page queue lock when vm_page_activate(),
- * vm_page_free(), or vm_page_launder() is called. Use a
- * marker to remember our place in the inactive queue.
- */
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
- vm_page_dequeue_locked(m);
- vm_pagequeue_unlock(pq);
- queue_locked = FALSE;
/*
* Invalid pages can be easily freed. They cannot be
@@ -1289,14 +1430,10 @@
* queue.
*/
m->act_count += act_delta + ACT_ADVANCE;
- goto drop_page;
+ continue;
} else if ((object->flags & OBJ_DEAD) == 0) {
- vm_pagequeue_lock(pq);
- queue_locked = TRUE;
- m->queue = PQ_INACTIVE;
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_inc(pq);
- goto drop_page;
+ vm_page_aflag_set(m, PGA_REQUEUE);
+ goto reinsert;
}
}
@@ -1322,21 +1459,49 @@
*/
if (m->dirty == 0) {
free_page:
- vm_page_free(m);
- VM_CNT_INC(v_dfree);
- --page_shortage;
+ page_shortage -= vm_pageout_free_pages(object,
+ m, &mtx);
} else if ((object->flags & OBJ_DEAD) == 0)
vm_page_launder(m);
-drop_page:
- vm_page_unlock(m);
- VM_OBJECT_WUNLOCK(object);
- if (!queue_locked) {
+ continue;
+
+ /*
+ * Re-add stuck pages to the queue. We will examine them again
+ * during the next scan. If the queue state of a page has
+ * changed since it was physically removed from the page queue,
+ * don't do anything with that page.
+ */
+reinsert:
+ if (!vm_batchqueue_insert(&rq, m)) {
vm_pagequeue_lock(pq);
- queue_locked = TRUE;
+ do {
+ if (!vm_page_inactive(m) ||
+ (m->aflags & PGA_ENQUEUED) != 0)
+ continue;
+ vm_page_aflag_set(m, PGA_ENQUEUED);
+ if ((m->aflags & PGA_REQUEUE) != 0) {
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m,
+ plinks.q);
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ } else
+ TAILQ_INSERT_BEFORE(ss.marker, m,
+ plinks.q);
+ vm_pagequeue_cnt_inc(pq);
+ } while ((m = vm_batchqueue_pop(&rq)) != NULL);
+ vm_pagequeue_unlock(pq);
+ vm_batchqueue_init(&rq);
}
- next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
- TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
}
+ if (mtx != NULL) {
+ mtx_unlock(mtx);
+ mtx = NULL;
+ }
+ if (obj_locked) {
+ VM_OBJECT_WUNLOCK(object);
+ obj_locked = false;
+ }
+ vm_pagequeue_lock(pq);
+ vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
/*
@@ -1399,9 +1564,9 @@
vm_paging_target(vmd) + deficit + addl_page_shortage;
inactq_shortage *= act_scan_laundry_weight;
+ marker = &vmd->vmd_markers[PQ_ACTIVE];
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
- maxscan = pq->pq_cnt;
/*
* If we're just idle polling attempt to visit every
@@ -1414,7 +1579,7 @@
min_scan /= hz * vm_pageout_update_period;
} else
min_scan = 0;
- if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
+ if (min_scan > 0 || (inactq_shortage > 0 && pq->pq_cnt > 0))
vmd->vmd_last_active_scan = scan_tick;
/*
@@ -1422,35 +1587,40 @@
* the per-page activity counter and use it to identify deactivation
* candidates. Held pages may be deactivated.
*/
- for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
- min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
- scanned++) {
- KASSERT(m->queue == PQ_ACTIVE,
- ("vm_pageout_scan: page %p isn't active", m));
- next = TAILQ_NEXT(m, plinks.q);
- if ((m->flags & PG_MARKER) != 0)
- continue;
- KASSERT((m->flags & PG_FICTITIOUS) == 0,
- ("Fictitious page %p cannot be in active queue", m));
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("Unmanaged page %p cannot be in active queue", m));
- if (!vm_pageout_page_lock(m, &next)) {
- vm_page_unlock(m);
- continue;
+act_scan:
+ TAILQ_INSERT_AFTER(&pq->pq_pl, &vmd->vmd_clock[0], marker, plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+ vm_pageout_init_scan(&ss, pq, marker, inactq_shortage > 0 ?
+ pq->pq_cnt : min_scan);
+ while ((m = vm_pageout_next(&ss, false)) != NULL) {
+ if (__predict_false(m == &vmd->vmd_clock[1])) {
+ vm_pagequeue_lock(pq);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
+ TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
+ plinks.q);
+ TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
+ plinks.q);
+ vm_pageout_end_scan(&ss);
+ goto act_scan;
}
+ if (__predict_false((m->flags & PG_MARKER) != 0))
+ continue;
+
+ vm_page_change_lock(m, &mtx);
/*
- * The count for page daemon pages is updated after checking
- * the page for eligibility.
+ * The page may have been disassociated from the queue
+ * while locks were dropped.
*/
- VM_CNT_INC(v_pdpages);
+ if (!vm_pageout_page_queued(m, PQ_ACTIVE))
+ continue;
/*
* Wired pages are dequeued lazily.
*/
if (m->wire_count != 0) {
- vm_page_dequeue_locked(m);
- vm_page_unlock(m);
+ vm_page_dequeue_lazy(m);
continue;
}
@@ -1494,9 +1664,6 @@
* queue depending on usage.
*/
if (m->act_count == 0) {
- /* Dequeue to avoid later lock recursion. */
- vm_page_dequeue_locked(m);
-
/*
* When not short for inactive pages, let dirty pages go
* through the inactive queue before moving to the
@@ -1528,11 +1695,19 @@
inactq_shortage--;
}
}
- } else
- vm_page_requeue_locked(m);
- vm_page_unlock(m);
+ }
}
+ if (mtx != NULL) {
+ mtx_unlock(mtx);
+ mtx = NULL;
+ }
+
+ vm_pagequeue_lock(pq);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
+ TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
+ vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
+
if (pass > 0)
vm_swapout_run_idle();
return (page_shortage <= 0);
@@ -1779,10 +1954,8 @@
KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
vmd->vmd_last_active_scan = ticks;
- vm_pageout_init_marker(&vmd->vmd_marker, PQ_INACTIVE);
- vm_pageout_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE);
- TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
- &vmd->vmd_inacthead, plinks.q);
+
+ vm_pageout_insert_markers(vmd);
/*
* The pageout daemon worker is never done, so loop forever.
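
The active queue scan above replaces the single per-scan marker with a pair of clock hands (vmd_clock[0] and vmd_clock[1]) that bracket the unscanned portion of the queue; when the scan reaches the trailing hand, both hands are moved back to the queue ends and the scan restarts. Below is a minimal userspace model of that bracketing, assuming only sys/queue.h; struct page and its id field are hypothetical stand-ins for the kernel's PG_MARKER pages, and the real scan of course does work on each page rather than printing it.

#include <sys/queue.h>
#include <stdio.h>

struct page {
	TAILQ_ENTRY(page) q;
	int id;			/* -1 marks a clock hand */
};
TAILQ_HEAD(pq, page);

int
main(void)
{
	struct pq pq = TAILQ_HEAD_INITIALIZER(pq);
	struct page pages[8], clock[2], *m;
	int i;

	clock[0].id = clock[1].id = -1;
	/* The hands initially bracket the whole queue. */
	TAILQ_INSERT_HEAD(&pq, &clock[0], q);
	for (i = 0; i < 8; i++) {
		pages[i].id = i;
		TAILQ_INSERT_TAIL(&pq, &pages[i], q);
	}
	TAILQ_INSERT_TAIL(&pq, &clock[1], q);

	/* Scan from the leading hand until the trailing hand is reached. */
	for (m = TAILQ_NEXT(&clock[0], q); m != &clock[1];
	    m = TAILQ_NEXT(m, q))
		printf("visit page %d\n", m->id);

	/* Wraparound: move both hands back to the queue ends and restart. */
	TAILQ_REMOVE(&pq, &clock[0], q);
	TAILQ_REMOVE(&pq, &clock[1], q);
	TAILQ_INSERT_HEAD(&pq, &clock[0], q);
	TAILQ_INSERT_TAIL(&pq, &clock[1], q);
	return (0);
}
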
Index: sys/vm/vm_pagequeue.h
===================================================================
--- sys/vm/vm_pagequeue.h
+++ sys/vm/vm_pagequeue.h
@@ -73,7 +73,18 @@
const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);
+#ifndef VM_BATCHQUEUE_SIZE
+#define VM_BATCHQUEUE_SIZE 7
+#endif
+
+struct vm_batchqueue {
+ vm_page_t bq_pa[VM_BATCHQUEUE_SIZE];
+ int bq_cnt;
+} __aligned(CACHE_LINE_SIZE);
+
+#include <vm/uma.h>
#include <sys/pidctrl.h>
+
struct sysctl_oid;
/*
@@ -81,17 +92,18 @@
* and accounting.
*
* Lock Key:
- * f vmd_free_mtx
- * p vmd_pageout_mtx
- * d vm_domainset_lock
- * a atomic
- * c const after boot
- * q page queue lock
+ * f vmd_free_mtx
+ * p vmd_pageout_mtx
+ * d vm_domainset_lock
+ * a atomic
+ * c const after boot
+ * q page queue lock
*/
struct vm_domain {
struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
struct mtx_padalign vmd_free_mtx;
struct mtx_padalign vmd_pageout_mtx;
+ uma_zone_t vmd_pgcache; /* (c) per-domain page free cache. */
struct vmem *vmd_kernel_arena; /* (c) per-domain kva arena. */
u_int vmd_domain; /* (c) Domain number. */
u_int vmd_page_count; /* (c) Total page count. */
@@ -105,9 +117,9 @@
boolean_t vmd_oom;
int vmd_oom_seq;
int vmd_last_active_scan;
- struct vm_page vmd_laundry_marker;
- struct vm_page vmd_marker; /* marker for pagedaemon private use */
+ struct vm_page vmd_markers[PQ_COUNT];
struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
+ struct vm_page vmd_clock[2]; /* markers for active queue scan */
int vmd_pageout_wanted; /* (a, p) pageout daemon wait channel */
int vmd_pageout_pages_needed; /* (d) page daemon waiting for pages? */
@@ -143,6 +155,7 @@
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
+#define vm_pagequeue_trylock(pq) mtx_trylock(&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
#define vm_domain_free_assert_locked(n) \
@@ -153,6 +166,8 @@
mtx_lock(vm_domain_free_lockptr((d)))
#define vm_domain_free_lockptr(d) \
(&(d)->vmd_free_mtx)
+#define vm_domain_free_trylock(d) \
+ mtx_trylock(vm_domain_free_lockptr((d)))
#define vm_domain_free_unlock(d) \
mtx_unlock(vm_domain_free_lockptr((d)))
@@ -171,14 +186,39 @@
vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
{
-#ifdef notyet
vm_pagequeue_assert_locked(pq);
-#endif
pq->pq_cnt += addend;
}
#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
+static inline void
+vm_batchqueue_init(struct vm_batchqueue *bq)
+{
+
+ bq->bq_cnt = 0;
+}
+
+static inline bool
+vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m)
+{
+
+ if (bq->bq_cnt < nitems(bq->bq_pa)) {
+ bq->bq_pa[bq->bq_cnt++] = m;
+ return (true);
+ }
+ return (false);
+}
+
+static inline vm_page_t
+vm_batchqueue_pop(struct vm_batchqueue *bq)
+{
+
+ if (bq->bq_cnt == 0)
+ return (NULL);
+ return (bq->bq_pa[--bq->bq_cnt]);
+}
+
void vm_domain_set(struct vm_domain *vmd);
void vm_domain_clear(struct vm_domain *vmd);
int vm_domain_allocate(struct vm_domain *vmd, int req, int npages);
Index: sys/vm/vm_phys.h
===================================================================
--- sys/vm/vm_phys.h
+++ sys/vm/vm_phys.h
@@ -78,6 +78,7 @@
vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool,
int order);
vm_page_t vm_phys_alloc_pages(int domain, int pool, int order);
+int vm_phys_alloc_npages(int domain, int pool, vm_page_t *m, int cnt);
int vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high);
int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
vm_memattr_t memattr);
Index: sys/vm/vm_phys.c
===================================================================
--- sys/vm/vm_phys.c
+++ sys/vm/vm_phys.c
@@ -354,9 +354,9 @@
m->order = order;
if (tail)
- TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
+ TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
else
- TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
+ TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
fl[order].lcnt++;
}
@@ -364,7 +364,7 @@
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{
- TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
+ TAILQ_REMOVE(&fl[order].pl, m, listq);
fl[order].lcnt--;
m->order = VM_NFREEORDER;
}
@@ -624,6 +624,26 @@
return (NULL);
}
+int
+vm_phys_alloc_npages(int domain, int pool, vm_page_t *mp, int cnt)
+{
+ vm_page_t m;
+ int order, freelist;
+
+ for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
+ for (order = fls(cnt) - 1; order >= 0; order--) {
+ m = vm_phys_alloc_freelist_pages(domain, freelist,
+ pool, order);
+ if (m != NULL) {
+ *mp = m;
+ return (1 << order);
+ }
+ }
+ }
+ *mp = NULL;
+ return (0);
+}
+
/*
* Allocate a contiguous, power of two-sized set of physical pages from the
* specified free list. The free list must be specified using one of the
@@ -1176,7 +1196,7 @@
oind++) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
fl = (*seg->free_queues)[pind];
- TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+ TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
/*
* Is the size of this allocation request
* larger than the largest block size?
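
vm_phys_alloc_npages() above walks the freelists trying successively smaller orders, starting from the largest power of two not exceeding cnt, and returns however many pages (1 << order) the first successful freelist allocation yields; callers retry for the remainder. A sketch of just the order-selection arithmetic follows, with the freelist walk elided; fls() is the BSD find-last-set from <strings.h>, and cnt = 31 is an illustrative value.

#include <stdio.h>
#include <strings.h>

int
main(void)
{
	int cnt = 31, order;

	for (order = fls(cnt) - 1; order >= 0; order--)
		printf("would try order %d (%d pages)\n", order, 1 << order);
	/* A successful order-4 allocation returns 16 pages for cnt = 31. */
	return (0);
}
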
Index: sys/vm/vm_reserv.c
===================================================================
--- sys/vm/vm_reserv.c
+++ sys/vm/vm_reserv.c
@@ -419,7 +419,7 @@
index));
KASSERT(rv->popcnt > 0,
("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
- KASSERT(rv->domain < vm_ndomains,
+ KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
rv, rv->domain));
if (rv->popcnt == VM_LEVEL_0_NPAGES) {
@@ -531,7 +531,7 @@
("vm_reserv_populate: reserv %p is already full", rv));
KASSERT(rv->pages->psind == 0,
("vm_reserv_populate: reserv %p is already promoted", rv));
- KASSERT(rv->domain < vm_ndomains,
+ KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
("vm_reserv_populate: reserv %p's domain is corrupted %d",
rv, rv->domain));
popmap_set(rv->popmap, index);
@@ -1218,7 +1218,7 @@
vm_reserv_domain_lock(rv->domain);
KASSERT(rv->inpartpopq,
("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
- KASSERT(rv->domain < vm_ndomains,
+ KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
("vm_reserv_reclaim: reserv %p's domain is corrupted %d",
rv, rv->domain));
TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -249,6 +249,7 @@
object->un_pager.vnp.vnp_size = size;
object->un_pager.vnp.writemappings = 0;
+ object->iosize = vp->v_mount->mnt_stat.f_iosize;
object->domain.dr_policy = vnode_domainset;
object->handle = handle;
@@ -769,7 +770,7 @@
object = vp->v_object;
foff = IDX_TO_OFF(m[0]->pindex);
- bsize = vp->v_mount->mnt_stat.f_iosize;
+ bsize = object->iosize;
pagesperblock = bsize / PAGE_SIZE;
KASSERT(foff < object->un_pager.vnp.vnp_size,
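
The vnode pager change caches the filesystem's f_iosize in the new object->iosize field at object-creation time, so the getpages path no longer reaches through vp->v_mount on every fault. The arithmetic the pager derives from it is simple; below is a sketch with made-up values (a 32 KB block size, 4 KB pages, and an arbitrary file offset), not figures from the patch.

#include <stdio.h>

int
main(void)
{
	long bsize = 32768;	/* object->iosize, i.e. f_iosize */
	long page_size = 4096;
	long foff = 81920;	/* byte offset of the faulting page */
	long pagesperblock = bsize / page_size;

	printf("pages per fs block: %ld\n", pagesperblock);	/* 8 */
	printf("fs block holding the page: %ld\n", foff / bsize); /* 2 */
	return (0);
}
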
