D14893.id40974.diff (68 KB)
Index: sys/amd64/include/vmparam.h
===================================================================
--- sys/amd64/include/vmparam.h
+++ sys/amd64/include/vmparam.h
@@ -227,4 +227,10 @@
#define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */
+/*
+ * Use a fairly large batch size since we expect amd64 systems to have lots of
+ * memory.
+ */
+#define VM_BATCHQUEUE_SIZE 31
+
#endif /* _MACHINE_VMPARAM_H_ */
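
A rough sizing note (illustrative, not part of the patch): with the struct vm_batchqueue layout introduced in vm_pagequeue.h later in this diff, a batch of 31 entries on amd64 comes to 31 * 8 + 4 = 252 bytes, which the CACHE_LINE_SIZE alignment rounds up to 256 bytes, i.e. four cache lines per CPU per page queue if the cache line size is 64 bytes; the MI default of 7 (7 * 8 + 4 = 60 bytes) fits within a single such cache line.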
Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -601,7 +601,6 @@
* CDEV
*/
{ "vm map (system)", &lock_class_mtx_sleep },
- { "vm pagequeue", &lock_class_mtx_sleep },
{ "vnode interlock", &lock_class_mtx_sleep },
{ "cdev", &lock_class_mtx_sleep },
{ NULL, NULL },
@@ -611,11 +610,11 @@
{ "vm map (user)", &lock_class_sx },
{ "vm object", &lock_class_rw },
{ "vm page", &lock_class_mtx_sleep },
- { "vm pagequeue", &lock_class_mtx_sleep },
{ "pmap pv global", &lock_class_rw },
{ "pmap", &lock_class_mtx_sleep },
{ "pmap pv list", &lock_class_rw },
{ "vm page free queue", &lock_class_mtx_sleep },
+ { "vm pagequeue", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* kqueue/VFS interaction
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -111,6 +111,7 @@
objtype_t type; /* type of pager */
u_short flags; /* see below */
u_short pg_color; /* (c) color of first page in obj */
+ u_int iosize; /* (c) Natural I/O size in bytes. */
u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
int resident_page_count; /* number of resident pages */
struct vm_object *backing_object; /* object that I'm a shadow of */
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -282,6 +282,7 @@
object->handle = NULL;
object->backing_object = NULL;
object->backing_object_offset = (vm_ooffset_t) 0;
+ object->iosize = 0;
#if VM_NRESERVLEVEL > 0
LIST_INIT(&object->rvq);
#endif
@@ -720,14 +721,11 @@
vm_object_terminate_pages(vm_object_t object)
{
vm_page_t p, p_next;
- struct mtx *mtx, *mtx1;
- struct vm_pagequeue *pq, *pq1;
- int dequeued;
+ struct mtx *mtx;
VM_OBJECT_ASSERT_WLOCKED(object);
mtx = NULL;
- pq = NULL;
/*
* Free any remaining pageable pages. This also removes them from the
@@ -737,60 +735,23 @@
*/
TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
vm_page_assert_unbusied(p);
- if ((object->flags & OBJ_UNMANAGED) == 0) {
+ if ((object->flags & OBJ_UNMANAGED) == 0)
/*
* vm_page_free_prep() only needs the page
* lock for managed pages.
*/
- mtx1 = vm_page_lockptr(p);
- if (mtx1 != mtx) {
- if (mtx != NULL)
- mtx_unlock(mtx);
- if (pq != NULL) {
- vm_pagequeue_cnt_add(pq, dequeued);
- vm_pagequeue_unlock(pq);
- pq = NULL;
- }
- mtx = mtx1;
- mtx_lock(mtx);
- }
- }
+ vm_page_change_lock(p, &mtx);
p->object = NULL;
if (p->wire_count != 0)
- goto unlist;
+ continue;
VM_CNT_INC(v_pfree);
p->flags &= ~PG_ZERO;
- if (p->queue != PQ_NONE) {
- KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
- "page %p is not queued", p));
- pq1 = vm_page_pagequeue(p);
- if (pq != pq1) {
- if (pq != NULL) {
- vm_pagequeue_cnt_add(pq, dequeued);
- vm_pagequeue_unlock(pq);
- }
- pq = pq1;
- vm_pagequeue_lock(pq);
- dequeued = 0;
- }
- p->queue = PQ_NONE;
- TAILQ_REMOVE(&pq->pq_pl, p, plinks.q);
- dequeued--;
- }
- if (vm_page_free_prep(p, true))
- continue;
-unlist:
- TAILQ_REMOVE(&object->memq, p, listq);
- }
- if (pq != NULL) {
- vm_pagequeue_cnt_add(pq, dequeued);
- vm_pagequeue_unlock(pq);
+
+ vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
- vm_page_free_phys_pglist(&object->memq);
-
/*
* If the object contained any pages, then reset it to an empty state.
* None of the object's fields, including "resident_page_count", were
@@ -1973,7 +1934,6 @@
{
vm_page_t p, next;
struct mtx *mtx;
- struct pglist pgl;
VM_OBJECT_ASSERT_WLOCKED(object);
KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@@ -1982,7 +1942,6 @@
if (object->resident_page_count == 0)
return;
vm_object_pip_add(object, 1);
- TAILQ_INIT(&pgl);
again:
p = vm_page_find_least(object, start);
mtx = NULL;
@@ -2037,12 +1996,10 @@
if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
pmap_remove_all(p);
p->flags &= ~PG_ZERO;
- if (vm_page_free_prep(p, false))
- TAILQ_INSERT_TAIL(&pgl, p, listq);
+ vm_page_free(p);
}
if (mtx != NULL)
mtx_unlock(mtx);
- vm_page_free_phys_pglist(&pgl);
vm_object_pip_wakeup(object);
}
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -94,7 +94,9 @@
* In general, operations on this structure's mutable fields are
* synchronized using either one of or a combination of the lock on the
* object that the page belongs to (O), the pool lock for the page (P),
- * or the lock for either the free or paging queue (Q). If a field is
+ * the per-domain lock for the free queues (F), or the page's queue
+ * lock (Q). The queue lock for a page depends on the value of its
+ * queue field and is described in detail below. If a field is
* annotated below with two of these locks, then holding either lock is
* sufficient for read access, but both locks are required for write
* access. An annotation of (C) indicates that the field is immutable.
@@ -143,6 +145,28 @@
* causing the thread to block. vm_page_sleep_if_busy() can be used to
* sleep until the page's busy state changes, after which the caller
* must re-lookup the page and re-evaluate its state.
+ *
+ * The queue field is the index of the page queue containing the
+ * page, or PQ_NONE if the page is not enqueued. The queue lock of a
+ * page is the page queue lock corresponding to the page queue index,
+ * or the page lock (P) for the page. To modify the queue field, the
+ * queue lock for the old value of the field must be held. It is
+ * invalid for a page's queue field to transition between two distinct
+ * page queue indices. That is, when updating the queue field, either
+ * the new value or the old value must be PQ_NONE.
+ *
+ * To avoid contention on page queue locks, page queue operations
+ * (enqueue, dequeue, requeue) are batched using per-CPU queues.
+ * A deferred operation is requested by inserting an entry into a
+ * batch queue; the entry is simply a pointer to the page, and the
+ * request type is encoded in the page's aflags field using the values
+ * in PGA_QUEUE_STATE_MASK. The type-stability of struct vm_pages is
+ * crucial to this scheme since the processing of entries in a given
+ * batch queue may be deferred indefinitely. In particular, a page
+ * may be freed before its pending batch queue entries have been
+ * processed. The page lock (P) must be held to schedule a batched
+ * queue operation, and the page queue lock must be held in order to
+ * process batch queue entries for the page queue.
*/
#if PAGE_SIZE == 4096
@@ -174,7 +198,7 @@
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */
vm_object_t object; /* which object am I in (O,P) */
vm_pindex_t pindex; /* offset into object (O,P) */
- vm_paddr_t phys_addr; /* physical address of page */
+ vm_paddr_t phys_addr; /* physical address of page (C) */
struct md_page md; /* machine dependent stuff */
u_int wire_count; /* wired down maps refs (P) */
volatile u_int busy_lock; /* busy owners lock */
@@ -182,11 +206,11 @@
uint16_t flags; /* page PG_* flags (P) */
uint8_t aflags; /* access is atomic */
uint8_t oflags; /* page VPO_* flags (O) */
- uint8_t queue; /* page queue index (P,Q) */
+ uint8_t queue; /* page queue index (Q) */
int8_t psind; /* pagesizes[] index (O) */
int8_t segind; /* vm_phys segment index (C) */
- uint8_t order; /* index of the buddy queue */
- uint8_t pool; /* vm_phys freepool index (Q) */
+ uint8_t order; /* index of the buddy queue (F) */
+ uint8_t pool; /* vm_phys freepool index (F) */
u_char act_count; /* page usage count (P) */
/* NOTE that these must support one bit per DEV_BSIZE in a page */
/* so, on normal X86 kernels, they must be at least 8 bits wide */
@@ -314,10 +338,32 @@
*
* PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has
* at least one executable mapping. It is not consumed by the MI VM layer.
+ *
+ * PGA_ENQUEUED is set and cleared when a page is inserted into or removed
+ * from a page queue, respectively. It determines whether the plinks.q field
+ * of the page is valid. To set or clear this flag, the queue lock for the
+ * page must be held: the page queue lock corresponding to the page's "queue"
+ * field if its value is not PQ_NONE, and the page lock otherwise.
+ *
+ * PGA_DEQUEUE is set when the page is scheduled to be dequeued from a page
+ * queue, and cleared when the dequeue request is processed. A page may
+ * have PGA_DEQUEUE set and PGA_ENQUEUED cleared, for instance if a dequeue
+ * is requested after the page is scheduled to be enqueued but before it is
+ * actually inserted into the page queue. The page lock must be held to set
+ * this flag, and the queue lock for the page must be held to clear it.
+ *
+ * PGA_REQUEUE is set when the page is scheduled to be requeued in its page
+ * queue. The page lock must be held to set this flag, and the queue lock
+ * for the page must be held to clear it.
*/
#define PGA_WRITEABLE 0x01 /* page may be mapped writeable */
#define PGA_REFERENCED 0x02 /* page has been referenced */
#define PGA_EXECUTABLE 0x04 /* page may be mapped executable */
+#define PGA_ENQUEUED 0x08 /* page is enqueued in a page queue */
+#define PGA_DEQUEUE 0x10 /* page is due to be dequeued */
+#define PGA_REQUEUE 0x20 /* page is due to be requeued */
+
+#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE)
/*
* Page flags. If changed at any other time than page allocation or
@@ -483,10 +529,10 @@
void vm_page_deactivate(vm_page_t);
void vm_page_deactivate_noreuse(vm_page_t);
void vm_page_dequeue(vm_page_t m);
+void vm_page_dequeue_lazy(vm_page_t m);
void vm_page_dequeue_locked(vm_page_t m);
vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
-void vm_page_free_phys_pglist(struct pglist *tq);
-bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
+bool vm_page_free_prep(vm_page_t m);
vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
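
To summarize the queue-state rules documented above: m->queue names the queue the page logically belongs to (or PQ_NONE), PGA_ENQUEUED tells whether plinks.q is actually linked into that queue, and PGA_DEQUEUE/PGA_REQUEUE record requests that a later batch flush will apply. The sketch below is illustrative only; the helper name is hypothetical and not part of the patch.

/*
 * Hypothetical helper, for illustration only: with the page lock held,
 * report whether a deferred dequeue or requeue is pending for the page.
 * PGA_ENQUEUED separately indicates whether plinks.q is currently linked
 * into the queue named by m->queue.
 */
static inline bool
example_page_queue_op_pending(vm_page_t m)
{

        vm_page_assert_locked(m);
        return ((m->aflags & (PGA_DEQUEUE | PGA_REQUEUE)) != 0);
}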
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -131,13 +131,11 @@
extern void uma_startup(void *, int);
extern int vmem_startup_count(void);
-/*
- * Associated with page of user-allocatable memory is a
- * page structure.
- */
-
struct vm_domain vm_dom[MAXMEMDOM];
+static DPCPU_DEFINE(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]);
+static DPCPU_DEFINE(struct vm_batchqueue, noreuseq[MAXMEMDOM]);
+
struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
@@ -176,7 +174,7 @@
static void vm_page_alloc_check(vm_page_t m);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_enqueue(uint8_t queue, vm_page_t m);
+static void vm_page_enqueue_lazy(vm_page_t m, uint8_t queue);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
vm_pindex_t pindex, vm_page_t mpred);
@@ -186,6 +184,9 @@
vm_page_t m_run, vm_paddr_t high);
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
int req);
+static int vm_page_import(void *arg, void **store, int cnt, int domain,
+ int flags);
+static void vm_page_release(void *arg, void **store, int cnt);
SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);
@@ -199,6 +200,33 @@
VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
}
+/*
+ * The cache page zone is initialized later since we need to be able to allocate
+ * pages before UMA is fully initialized.
+ */
+static void
+vm_page_init_cache_zones(void *dummy __unused)
+{
+ struct vm_domain *vmd;
+ int i;
+
+ for (i = 0; i < vm_ndomains; i++) {
+ vmd = VM_DOMAIN(i);
+ /*
+ * Don't allow the page cache to take up more than .25% of
+ * memory.
+ */
+ if (vmd->vmd_page_count / 400 < 256 * mp_ncpus)
+ continue;
+ vmd->vmd_pgcache = uma_zcache_create("vm pgcache",
+ sizeof(struct vm_page), NULL, NULL, NULL, NULL,
+ vm_page_import, vm_page_release, vmd,
+ /* UMA_ZONE_NOBUCKETCACHE |*/
+ UMA_ZONE_MAXBUCKET | UMA_ZONE_VM);
+ }
+}
+SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);
+
/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
@@ -1786,6 +1814,7 @@
#if VM_NRESERVLEVEL > 0
found:
#endif
+ vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@@ -1982,8 +2011,10 @@
#if VM_NRESERVLEVEL > 0
found:
#endif
- for (m = m_ret; m < &m_ret[npages]; m++)
+ for (m = m_ret; m < &m_ret[npages]; m++) {
+ vm_page_dequeue(m);
vm_page_alloc_check(m);
+ }
/*
* Initialize the pages. Only the PG_ZERO flag is inherited.
@@ -2127,6 +2158,7 @@
goto again;
return (NULL);
}
+ vm_page_dequeue(m);
vm_page_alloc_check(m);
/*
@@ -2150,6 +2182,51 @@
return (m);
}
+static int
+vm_page_import(void *arg, void **store, int cnt, int domain, int flags)
+{
+ struct vm_domain *vmd;
+ vm_page_t m;
+ int i, j, n;
+
+ vmd = arg;
+ domain = vmd->vmd_domain;
+ n = 64; /* Starting stride. */
+ vm_domain_free_lock(vmd);
+ for (i = 0; i < cnt; i+=n) {
+ n = vm_phys_alloc_npages(domain, VM_FREELIST_DEFAULT, &m,
+ MIN(n, cnt-i));
+ if (n == 0)
+ break;
+ if (!vm_domain_allocate(vmd, VM_ALLOC_NORMAL, n)) {
+ vm_phys_free_contig(m, n);
+ break;
+ }
+ for (j = 0; j < n; j++)
+ store[i+j] = m++;
+ }
+ vm_domain_free_unlock(vmd);
+
+ return (i);
+}
+
+static void
+vm_page_release(void *arg, void **store, int cnt)
+{
+ struct vm_domain *vmd;
+ vm_page_t m;
+ int i;
+
+ vmd = arg;
+ vm_domain_free_lock(vmd);
+ for (i = 0; i < cnt; i++) {
+ m = (vm_page_t)store[i];
+ vm_phys_free_pages(m, 0);
+ }
+ vm_domain_free_unlock(vmd);
+ vm_domain_freecnt_inc(vmd, cnt);
+}
+
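
For context on how such a cache zone is consumed: a zone created with uma_zcache_create() is normally drawn from with uma_zalloc() and refilled through the import/release callbacks above. The sketch below is an assumption about how an allocation path might use vmd_pgcache; the function name is hypothetical, and the real vm_page_alloc*() integration is not part of the hunks shown here.

/*
 * Hedged sketch (not from the patch): one way an allocation path might
 * draw from the per-domain page cache zone before falling back to the
 * free lists.
 */
static vm_page_t
example_pgcache_alloc(struct vm_domain *vmd)
{

        if (vmd->vmd_pgcache == NULL)
                return (NULL);          /* zone not created for small domains */
        return (uma_zalloc(vmd->vmd_pgcache, M_NOWAIT));
}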
#define VPSC_ANY 0 /* No restrictions. */
#define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */
#define VPSC_NOSUPER 2 /* Skip superpages. */
@@ -2274,7 +2351,8 @@
vm_reserv_size(level)) - pa);
#endif
} else if (object->memattr == VM_MEMATTR_DEFAULT &&
- m->queue != PQ_NONE && !vm_page_busied(m)) {
+ m->queue != PQ_NONE &&
+ (m->aflags & PGA_DEQUEUE) == 0 && !vm_page_busied(m)) {
/*
* The page is allocated but eligible for
* relocation. Extend the current run by one
@@ -2425,7 +2503,9 @@
error = EINVAL;
else if (object->memattr != VM_MEMATTR_DEFAULT)
error = EINVAL;
- else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
+ else if (m->queue != PQ_NONE &&
+ (m->aflags & PGA_DEQUEUE) == 0 &&
+ !vm_page_busied(m)) {
KASSERT(pmap_page_get_memattr(m) ==
VM_MEMATTR_DEFAULT,
("page %p has an unexpected memattr", m));
@@ -2485,7 +2565,8 @@
*/
if (object->ref_count != 0)
pmap_remove_all(m);
- m_new->aflags = m->aflags;
+ m_new->aflags = m->aflags &
+ ~PGA_QUEUE_STATE_MASK;
KASSERT(m_new->oflags == VPO_UNMANAGED,
("page %p is managed", m_new));
m_new->oflags = m->oflags & VPO_NOSYNC;
@@ -2497,7 +2578,7 @@
vm_page_remque(m);
vm_page_replace_checked(m_new, object,
m->pindex, m);
- if (vm_page_free_prep(m, false))
+ if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
@@ -2511,7 +2592,7 @@
m->flags &= ~PG_ZERO;
vm_page_remque(m);
vm_page_remove(m);
- if (vm_page_free_prep(m, false))
+ if (vm_page_free_prep(m))
SLIST_INSERT_HEAD(&free, m,
plinks.s.ss);
KASSERT(m->dirty == 0,
@@ -2954,113 +3035,289 @@
return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]);
}
+static struct mtx *
+vm_page_pagequeue_lockptr(vm_page_t m)
+{
+
+ if (m->queue == PQ_NONE)
+ return (NULL);
+ return (&vm_page_pagequeue(m)->pq_mutex);
+}
+
+static void
+vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
+ uint8_t queue)
+{
+ vm_page_t m;
+ int delta, i;
+ uint8_t aflags;
+
+ vm_pagequeue_assert_locked(pq);
+
+ delta = 0;
+ for (i = 0; i < bq->bq_cnt; i++) {
+ m = bq->bq_pa[i];
+ if (__predict_false(m->queue != queue))
+ continue;
+
+ aflags = m->aflags;
+ if ((aflags & PGA_DEQUEUE) != 0) {
+ if (__predict_true((aflags & PGA_ENQUEUED) != 0)) {
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ delta--;
+ }
+
+ /*
+ * Synchronize with the page daemon, which may be
+ * simultaneously scanning this page with only the page
+ * lock held. We must be careful to avoid leaving the
+ * page in a state where it appears to belong to a page
+ * queue.
+ */
+ m->queue = PQ_NONE;
+ atomic_thread_fence_rel();
+ vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
+ } else if ((aflags & PGA_ENQUEUED) == 0) {
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ delta++;
+ vm_page_aflag_set(m, PGA_ENQUEUED);
+ if ((aflags & PGA_REQUEUE) != 0)
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ } else if ((aflags & PGA_REQUEUE) != 0) {
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ }
+ }
+ vm_batchqueue_init(bq);
+ vm_pagequeue_cnt_add(pq, delta);
+}
+
/*
- * vm_page_dequeue:
+ * vm_page_dequeue_lazy: [ internal use only ]
*
- * Remove the given page from its current page queue.
+ * Request removal of the given page from its current page
+ * queue. Physical removal from the queue may be deferred
+ * arbitrarily, and may be cancelled by later queue operations
+ * on that page.
*
* The page must be locked.
*/
void
-vm_page_dequeue(vm_page_t m)
+vm_page_dequeue_lazy(vm_page_t m)
{
+ struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
+ int domain, queue;
vm_page_assert_locked(m);
- KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
- m));
- pq = vm_page_pagequeue(m);
- vm_pagequeue_lock(pq);
- m->queue = PQ_NONE;
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_dec(pq);
+
+ queue = m->queue;
+ if (queue == PQ_NONE)
+ return;
+ domain = vm_phys_domain(m);
+ pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
+
+ vm_page_aflag_set(m, PGA_DEQUEUE);
+
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ if (vm_batchqueue_insert(bq, m)) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ }
+ vm_pqbatch_process(pq, bq, queue);
+
+ /*
+ * The page may have been dequeued by another thread before we
+ * acquired the page queue lock. However, since we hold the
+ * page lock, the page's queue field cannot change a second
+ * time and we can safely clear PGA_DEQUEUE.
+ */
+ KASSERT(m->queue == queue || m->queue == PQ_NONE,
+ ("%s: page %p migrated between queues", __func__, m));
+ if (m->queue == queue) {
+ (void)vm_batchqueue_insert(bq, m);
+ vm_pqbatch_process(pq, bq, queue);
+ } else
+ vm_page_aflag_clear(m, PGA_DEQUEUE);
vm_pagequeue_unlock(pq);
+ critical_exit();
}
/*
* vm_page_dequeue_locked:
*
- * Remove the given page from its current page queue.
+ * Remove the page from its page queue, which must be locked.
+ * If the page lock is not held, there is no guarantee that the
+ * page will not be enqueued by another thread before this function
+ * returns. In this case, it is up to the caller to ensure that
+ * no other threads hold a reference to the page.
*
- * The page and page queue must be locked.
+ * The page queue lock must be held. If the page is not already
+ * logically dequeued, the page lock must be held as well.
*/
void
vm_page_dequeue_locked(vm_page_t m)
{
struct vm_pagequeue *pq;
- vm_page_lock_assert(m, MA_OWNED);
- pq = vm_page_pagequeue(m);
- vm_pagequeue_assert_locked(pq);
+ KASSERT(m->queue != PQ_NONE,
+ ("%s: page %p queue field is PQ_NONE", __func__, m));
+ vm_pagequeue_assert_locked(vm_page_pagequeue(m));
+ KASSERT((m->aflags & PGA_DEQUEUE) != 0 ||
+ mtx_owned(vm_page_lockptr(m)),
+ ("%s: queued unlocked page %p", __func__, m));
+
+ if ((m->aflags & PGA_ENQUEUED) != 0) {
+ pq = vm_page_pagequeue(m);
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_dec(pq);
+ }
+
+ /*
+ * Synchronize with the page daemon, which may be simultaneously
+ * scanning this page with only the page lock held. We must be careful
+ * to avoid leaving the page in a state where it appears to belong to a
+ * page queue.
+ */
m->queue = PQ_NONE;
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_dec(pq);
+ atomic_thread_fence_rel();
+ vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK);
+}
+
+/*
+ * vm_page_dequeue:
+ *
+ * Remove the page from whichever page queue it's in, if any.
+ * If the page lock is not held, there is no guarantee that the
+ * page will not be enqueued by another thread before this function
+ * returns. In this case, it is up to the caller to ensure that
+ * no other threads hold a reference to the page.
+ */
+void
+vm_page_dequeue(vm_page_t m)
+{
+ struct mtx *lock, *lock1;
+
+ lock = vm_page_pagequeue_lockptr(m);
+ for (;;) {
+ if (lock == NULL)
+ return;
+ mtx_lock(lock);
+ if ((lock1 = vm_page_pagequeue_lockptr(m)) == lock)
+ break;
+ mtx_unlock(lock);
+ lock = lock1;
+ }
+ KASSERT(lock == vm_page_pagequeue_lockptr(m),
+ ("%s: page %p migrated directly between queues", __func__, m));
+ vm_page_dequeue_locked(m);
+ mtx_unlock(lock);
}
/*
- * vm_page_enqueue:
+ * vm_page_enqueue_lazy:
*
- * Add the given page to the specified page queue.
+ * Schedule the given page for insertion into the specified page queue.
+ * Physical insertion of the page may be deferred indefinitely.
*
* The page must be locked.
*/
static void
-vm_page_enqueue(uint8_t queue, vm_page_t m)
+vm_page_enqueue_lazy(vm_page_t m, uint8_t queue)
{
+ struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
+ int domain;
- vm_page_lock_assert(m, MA_OWNED);
- KASSERT(queue < PQ_COUNT,
- ("vm_page_enqueue: invalid queue %u request for page %p",
- queue, m));
+ vm_page_assert_locked(m);
+ KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0,
+ ("%s: page %p is already enqueued", __func__, m));
+
+ domain = vm_phys_domain(m);
pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue];
- vm_pagequeue_lock(pq);
+
+ /*
+ * The queue field might be changed back to PQ_NONE by a concurrent
+ * call to vm_page_dequeue(). In that case the batch queue entry will
+ * be a no-op.
+ */
m->queue = queue;
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_inc(pq);
+
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ if (__predict_true(vm_batchqueue_insert(bq, m))) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ }
+ vm_pqbatch_process(pq, bq, queue);
+ (void)vm_batchqueue_insert(bq, m);
+ vm_pqbatch_process(pq, bq, queue);
vm_pagequeue_unlock(pq);
+ critical_exit();
}
/*
* vm_page_requeue:
*
- * Move the given page to the tail of its current page queue.
+ * Schedule a requeue of the given page.
*
* The page must be locked.
*/
void
vm_page_requeue(vm_page_t m)
{
+ struct vm_batchqueue *bq;
struct vm_pagequeue *pq;
+ int domain, queue;
vm_page_lock_assert(m, MA_OWNED);
KASSERT(m->queue != PQ_NONE,
- ("vm_page_requeue: page %p is not queued", m));
+ ("%s: page %p is not enqueued", __func__, m));
+
+ domain = vm_phys_domain(m);
+ queue = m->queue;
pq = vm_page_pagequeue(m);
- vm_pagequeue_lock(pq);
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_unlock(pq);
-}
-/*
- * vm_page_requeue_locked:
- *
- * Move the given page to the tail of its current page queue.
- *
- * The page queue must be locked.
- */
-void
-vm_page_requeue_locked(vm_page_t m)
-{
- struct vm_pagequeue *pq;
+ if (queue == PQ_NONE)
+ return;
- KASSERT(m->queue != PQ_NONE,
- ("vm_page_requeue_locked: page %p is not queued", m));
- pq = vm_page_pagequeue(m);
- vm_pagequeue_assert_locked(pq);
- TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_page_aflag_set(m, PGA_REQUEUE);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ if (__predict_true(vm_batchqueue_insert(bq, m))) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(pqbatch[domain][queue]);
+ }
+ vm_pqbatch_process(pq, bq, queue);
+ KASSERT(m->queue == queue || m->queue == PQ_NONE,
+ ("%s: page %p migrated between queues", __func__, m));
+ if (m->queue == queue) {
+ (void)vm_batchqueue_insert(bq, m);
+ vm_pqbatch_process(pq, bq, queue);
+ } else
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ vm_pagequeue_unlock(pq);
+ critical_exit();
}
/*
@@ -3078,18 +3335,18 @@
int queue;
vm_page_lock_assert(m, MA_OWNED);
- if ((queue = m->queue) != PQ_ACTIVE) {
- if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- if (m->act_count < ACT_INIT)
- m->act_count = ACT_INIT;
- if (queue != PQ_NONE)
- vm_page_dequeue(m);
- vm_page_enqueue(PQ_ACTIVE, m);
- }
- } else {
- if (m->act_count < ACT_INIT)
+
+ if ((queue = m->queue) == PQ_ACTIVE || m->wire_count > 0 ||
+ (m->oflags & VPO_UNMANAGED) != 0) {
+ if (queue == PQ_ACTIVE && m->act_count < ACT_INIT)
m->act_count = ACT_INIT;
+ return;
}
+
+ vm_page_remque(m);
+ if (m->act_count < ACT_INIT)
+ m->act_count = ACT_INIT;
+ vm_page_enqueue_lazy(m, PQ_ACTIVE);
}
/*
@@ -3100,11 +3357,10 @@
* the page to the free list only if this function returns true.
*
* The object must be locked. The page must be locked if it is
- * managed. For a queued managed page, the pagequeue_locked
- * argument specifies whether the page queue is already locked.
+ * managed.
*/
bool
-vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
+vm_page_free_prep(vm_page_t m)
{
#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
@@ -3120,14 +3376,14 @@
if ((m->oflags & VPO_UNMANAGED) == 0) {
vm_page_lock_assert(m, MA_OWNED);
KASSERT(!pmap_page_is_mapped(m),
- ("vm_page_free_toq: freeing mapped page %p", m));
+ ("vm_page_free_prep: freeing mapped page %p", m));
} else
KASSERT(m->queue == PQ_NONE,
- ("vm_page_free_toq: unmanaged page %p is queued", m));
+ ("vm_page_free_prep: unmanaged page %p is queued", m));
VM_CNT_INC(v_tfree);
if (vm_page_sbusied(m))
- panic("vm_page_free: freeing busy page %p", m);
+ panic("vm_page_free_prep: freeing busy page %p", m);
vm_page_remove(m);
@@ -3143,21 +3399,23 @@
return (false);
}
- if (m->queue != PQ_NONE) {
- if (pagequeue_locked)
- vm_page_dequeue_locked(m);
- else
- vm_page_dequeue(m);
- }
+ /*
+ * Pages need not be dequeued before they are returned to the physical
+ * memory allocator, but they must at least be marked for a deferred
+ * dequeue.
+ */
+ if ((m->oflags & VPO_UNMANAGED) == 0)
+ vm_page_dequeue_lazy(m);
+
m->valid = 0;
vm_page_undirty(m);
if (m->wire_count != 0)
- panic("vm_page_free: freeing wired page %p", m);
+ panic("vm_page_free_prep: freeing wired page %p", m);
if (m->hold_count != 0) {
m->flags &= ~PG_ZERO;
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
- ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
+ ("vm_page_free_prep: freeing PG_UNHOLDFREE page %p", m));
m->flags |= PG_UNHOLDFREE;
return (false);
}
@@ -3176,36 +3434,6 @@
return (true);
}
-void
-vm_page_free_phys_pglist(struct pglist *tq)
-{
- struct vm_domain *vmd;
- vm_page_t m;
- int cnt;
-
- if (TAILQ_EMPTY(tq))
- return;
- vmd = NULL;
- cnt = 0;
- TAILQ_FOREACH(m, tq, listq) {
- if (vmd != vm_pagequeue_domain(m)) {
- if (vmd != NULL) {
- vm_domain_free_unlock(vmd);
- vm_domain_freecnt_inc(vmd, cnt);
- cnt = 0;
- }
- vmd = vm_pagequeue_domain(m);
- vm_domain_free_lock(vmd);
- }
- vm_phys_free_pages(m, 0);
- cnt++;
- }
- if (vmd != NULL) {
- vm_domain_free_unlock(vmd);
- vm_domain_freecnt_inc(vmd, cnt);
- }
-}
-
/*
* vm_page_free_toq:
*
@@ -3220,8 +3448,9 @@
{
struct vm_domain *vmd;
- if (!vm_page_free_prep(m, false))
+ if (!vm_page_free_prep(m))
return;
+
vmd = vm_pagequeue_domain(m);
vm_domain_free_lock(vmd);
vm_phys_free_pages(m, 0);
@@ -3243,23 +3472,18 @@
vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
{
vm_page_t m;
- struct pglist pgl;
int count;
if (SLIST_EMPTY(free))
return;
count = 0;
- TAILQ_INIT(&pgl);
while ((m = SLIST_FIRST(free)) != NULL) {
count++;
SLIST_REMOVE_HEAD(free, plinks.s.ss);
- if (vm_page_free_prep(m, false))
- TAILQ_INSERT_TAIL(&pgl, m, listq);
+ vm_page_free_toq(m);
}
- vm_page_free_phys_pglist(&pgl);
-
if (update_wire_count)
vm_wire_sub(count);
}
@@ -3318,22 +3542,25 @@
KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
("vm_page_unwire: invalid queue %u request for page %p",
queue, m));
+ if ((m->oflags & VPO_UNMANAGED) == 0)
+ vm_page_assert_locked(m);
unwired = vm_page_unwire_noq(m);
- if (unwired && (m->oflags & VPO_UNMANAGED) == 0 && m->object != NULL) {
- if (m->queue == queue) {
+ if (!unwired || (m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL)
+ return (unwired);
+
+ if (m->queue == queue) {
+ if (queue == PQ_ACTIVE)
+ vm_page_reference(m);
+ else if (queue != PQ_NONE)
+ vm_page_requeue(m);
+ } else {
+ vm_page_dequeue(m);
+ if (queue != PQ_NONE) {
+ vm_page_enqueue_lazy(m, queue);
if (queue == PQ_ACTIVE)
- vm_page_reference(m);
- else if (queue != PQ_NONE)
- vm_page_requeue(m);
- } else {
- vm_page_remque(m);
- if (queue != PQ_NONE) {
- vm_page_enqueue(queue, m);
- if (queue == PQ_ACTIVE)
- /* Initialize act_count. */
- vm_page_activate(m);
- }
+ /* Initialize act_count. */
+ vm_page_activate(m);
}
}
return (unwired);
@@ -3369,73 +3596,85 @@
}
/*
- * Move the specified page to the inactive queue, or requeue the page if it is
- * already in the inactive queue.
- *
- * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
- * queue. However, setting "noreuse" to TRUE will accelerate the specified
- * page's reclamation, but it will not unmap the page from any address space.
- * This is implemented by inserting the page near the head of the inactive
- * queue, using a marker page to guide FIFO insertion ordering.
+ * Move the specified page to the tail of the inactive queue, or requeue
+ * the page if it is already in the inactive queue.
*
* The page must be locked.
*/
-static inline void
-_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
+void
+vm_page_deactivate(vm_page_t m)
{
- struct vm_pagequeue *pq;
- int queue;
vm_page_assert_locked(m);
- if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE];
- /* Avoid multiple acquisitions of the inactive queue lock. */
- queue = m->queue;
- if (queue == PQ_INACTIVE) {
- vm_pagequeue_lock(pq);
- vm_page_dequeue_locked(m);
- } else {
- if (queue != PQ_NONE)
- vm_page_dequeue(m);
- vm_pagequeue_lock(pq);
- }
- m->queue = PQ_INACTIVE;
- if (noreuse)
- TAILQ_INSERT_BEFORE(
- &vm_pagequeue_domain(m)->vmd_inacthead, m,
- plinks.q);
- else
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_inc(pq);
- vm_pagequeue_unlock(pq);
- }
-}
-
-/*
- * Move the specified page to the inactive queue, or requeue the page if it is
- * already in the inactive queue.
- *
- * The page must be locked.
- */
-void
-vm_page_deactivate(vm_page_t m)
-{
+ if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
+ return;
- _vm_page_deactivate(m, FALSE);
+ if (!vm_page_inactive(m)) {
+ vm_page_remque(m);
+ vm_page_enqueue_lazy(m, PQ_INACTIVE);
+ } else
+ vm_page_requeue(m);
}
/*
- * Move the specified page to the inactive queue with the expectation
- * that it is unlikely to be reused.
+ * Move the specified page close to the head of the inactive queue,
+ * bypassing LRU. A marker page is used to maintain FIFO ordering.
+ * As with regular enqueues, we use a per-CPU batch queue to reduce
+ * contention on the page queue lock.
*
* The page must be locked.
*/
void
vm_page_deactivate_noreuse(vm_page_t m)
{
+ struct vm_batchqueue *bq;
+ struct vm_domain *vmd;
+ struct vm_pagequeue *pq;
+ vm_page_t marker;
+ int domain;
+
+ vm_page_assert_locked(m);
+
+ if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
+ return;
+
+ domain = vm_phys_domain(m);
+ vmd = VM_DOMAIN(domain);
+ pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
+
+ if (!vm_page_inactive(m))
+ vm_page_remque(m);
- _vm_page_deactivate(m, TRUE);
+ m->queue = PQ_INACTIVE;
+
+ critical_enter();
+ bq = DPCPU_PTR(noreuseq[domain]);
+ if (vm_batchqueue_insert(bq, m)) {
+ critical_exit();
+ return;
+ }
+ if (!vm_pagequeue_trylock(pq)) {
+ critical_exit();
+ vm_pagequeue_lock(pq);
+ critical_enter();
+ bq = DPCPU_PTR(noreuseq[domain]);
+ }
+ marker = &vmd->vmd_inacthead;
+ do {
+ if (m->queue != PQ_INACTIVE)
+ continue;
+ if ((m->aflags & PGA_ENQUEUED) != 0)
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ else {
+ vm_page_aflag_set(m, PGA_ENQUEUED);
+ vm_pagequeue_cnt_inc(pq);
+ }
+ TAILQ_INSERT_BEFORE(marker, m, plinks.q);
+ } while ((m = vm_batchqueue_pop(bq)) != NULL);
+ vm_pagequeue_unlock(pq);
+ vm_batchqueue_init(bq);
+ critical_exit();
}
/*
@@ -3448,13 +3687,14 @@
{
vm_page_assert_locked(m);
- if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
- if (m->queue == PQ_LAUNDRY)
- vm_page_requeue(m);
- else {
- vm_page_remque(m);
- vm_page_enqueue(PQ_LAUNDRY, m);
- }
+ if (m->wire_count > 0 || (m->oflags & VPO_UNMANAGED) != 0)
+ return;
+
+ if (m->queue == PQ_LAUNDRY)
+ vm_page_requeue(m);
+ else {
+ vm_page_remque(m);
+ vm_page_enqueue_lazy(m, PQ_LAUNDRY);
}
}
@@ -3470,9 +3710,9 @@
vm_page_assert_locked(m);
KASSERT(m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0,
("page %p already unswappable", m));
- if (m->queue != PQ_NONE)
- vm_page_dequeue(m);
- vm_page_enqueue(PQ_UNSWAPPABLE, m);
+
+ vm_page_remque(m);
+ vm_page_enqueue_lazy(m, PQ_UNSWAPPABLE);
}
/*
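
The lazy dequeue, lazy enqueue, and requeue paths above all share one deferral pattern: record the request in this CPU's batch, and only take the page queue lock when the batch overflows. The condensed sketch below restates that pattern in one place; it is illustrative only, the function name is hypothetical, and the per-page flag handling and rechecks performed by the real callers are omitted.

/*
 * Condensed, illustrative restatement of the deferral pattern shared by
 * vm_page_dequeue_lazy(), vm_page_enqueue_lazy() and vm_page_requeue(),
 * as it would appear inside vm_page.c.
 */
static void
example_pqbatch_submit(vm_page_t m, int domain, uint8_t queue)
{
        struct vm_batchqueue *bq;
        struct vm_pagequeue *pq;

        pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];

        critical_enter();                       /* stay on this CPU's batch */
        bq = DPCPU_PTR(pqbatch[domain][queue]);
        if (vm_batchqueue_insert(bq, m)) {
                /* Common case: the request is simply recorded. */
                critical_exit();
                return;
        }

        /*
         * The batch is full and must be flushed.  Avoid sleeping inside the
         * critical section: if the trylock fails, leave the critical section,
         * block on the queue lock, and refetch the batch pointer, since the
         * thread may have migrated to another CPU in the meantime.
         */
        if (!vm_pagequeue_trylock(pq)) {
                critical_exit();
                vm_pagequeue_lock(pq);
                critical_enter();
                bq = DPCPU_PTR(pqbatch[domain][queue]);
        }
        vm_pqbatch_process(pq, bq, queue);
        (void)vm_batchqueue_insert(bq, m);      /* the batch is now empty */
        vm_pqbatch_process(pq, bq, queue);
        vm_pagequeue_unlock(pq);
        critical_exit();
}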
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -201,11 +201,17 @@
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
static u_int isqrt(u_int num);
-static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static int vm_pageout_launder(struct vm_domain *vmd, int launder,
bool in_shortfall);
static void vm_pageout_laundry_worker(void *arg);
-static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
+
+struct scan_state {
+ struct vm_batchqueue bq;
+ struct vm_pagequeue *pq;
+ vm_page_t marker;
+ int maxscan;
+ int scanned;
+};
/*
* Initialize a dummy page for marking the caller's place in the specified
@@ -225,96 +231,157 @@
}
/*
- * vm_pageout_fallback_object_lock:
- *
- * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
- * known to have failed and page queue must be either PQ_ACTIVE or
- * PQ_INACTIVE. To avoid lock order violation, unlock the page queue
- * while locking the vm object. Use marker page to detect page queue
- * changes and maintain notion of next page on page queue. Return
- * TRUE if no changes were detected, FALSE otherwise. vm object is
- * locked on return.
- *
- * This function depends on both the lock portion of struct vm_object
- * and normal struct vm_page being type stable.
+ * Initialize and enqueue static queue markers.
*/
-static boolean_t
-vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
+static void
+vm_pageout_insert_markers(struct vm_domain *vmd)
{
- struct vm_page marker;
- struct vm_pagequeue *pq;
- boolean_t unchanged;
- u_short queue;
- vm_object_t object;
+ vm_page_t marker;
+ int i;
- queue = m->queue;
- vm_pageout_init_marker(&marker, queue);
- pq = vm_page_pagequeue(m);
- object = m->object;
-
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
- vm_pagequeue_unlock(pq);
- vm_page_unlock(m);
- VM_OBJECT_WLOCK(object);
- vm_page_lock(m);
- vm_pagequeue_lock(pq);
+ /*
+ * inacthead is used to provide FIFO ordering for LRU-bypassing
+ * insertions.
+ */
+ marker = &vmd->vmd_inacthead;
+ vm_pageout_init_marker(marker, PQ_INACTIVE);
+ TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, marker,
+ plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
/*
- * The page's object might have changed, and/or the page might
- * have moved from its original position in the queue. If the
- * page's object has changed, then the caller should abandon
- * processing the page because the wrong object lock was
- * acquired. Use the marker's plinks.q, not the page's, to
- * determine if the page has been moved. The state of the
- * page's plinks.q can be indeterminate; whereas, the marker's
- * plinks.q must be valid.
+ * The clock pages are used to implement active queue scanning without
+ * requeues. Scans start at clock[0], which is advanced after the scan
+ * ends. When the two clock hands meet, they are reset and scanning
+ * resumes from the head of the queue.
*/
- *next = TAILQ_NEXT(&marker, plinks.q);
- unchanged = m->object == object &&
- m == TAILQ_PREV(&marker, pglist, plinks.q);
- KASSERT(!unchanged || m->queue == queue,
- ("page %p queue %d %d", m, queue, m->queue));
- TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
- return (unchanged);
+ marker = &vmd->vmd_clock[0];
+ vm_pageout_init_marker(marker, PQ_ACTIVE);
+ TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, marker,
+ plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+ marker = &vmd->vmd_clock[1];
+ vm_pageout_init_marker(marker, PQ_ACTIVE);
+ TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, marker,
+ plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+
+ for (i = 0; i < PQ_COUNT; i++)
+ vm_pageout_init_marker(&vmd->vmd_markers[i], i);
+}
+
+static void
+vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
+ vm_page_t marker, int maxscan)
+{
+
+ vm_pagequeue_assert_locked(pq);
+
+ if ((marker->aflags & PGA_ENQUEUED) == 0) {
+ TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+ }
+ vm_batchqueue_init(&ss->bq);
+ ss->pq = pq;
+ ss->marker = marker;
+ ss->maxscan = maxscan;
+ ss->scanned = 0;
+ vm_pagequeue_unlock(pq);
+}
+
+static void
+vm_pageout_end_scan(struct scan_state *ss)
+{
+ struct vm_pagequeue *pq;
+
+ pq = ss->pq;
+ vm_pagequeue_assert_locked(pq);
+ KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
+ ("marker %p not enqueued", ss->marker));
+
+ if ((ss->marker->aflags & PGA_ENQUEUED) != 0) {
+ TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
+ vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
+ }
+ VM_CNT_ADD(v_pdpages, ss->scanned);
+}
+
+static inline bool
+vm_pageout_page_queued(vm_page_t m, int queue)
+{
+
+ vm_page_assert_locked(m);
+
+ if ((m->aflags & PGA_DEQUEUE) != 0)
+ return (false);
+ atomic_thread_fence_acq();
+ return (m->queue == queue);
}
/*
- * Lock the page while holding the page queue lock. Use marker page
- * to detect page queue changes and maintain notion of next page on
- * page queue. Return TRUE if no changes were detected, FALSE
- * otherwise. The page is locked on return. The page queue lock might
- * be dropped and reacquired.
+ * Add a small number of queued pages to a batch queue for later processing
+ * without the corresponding queue lock held. The caller must have enqueued a
+ * marker page at the desired start point for the scan.
*
- * This function depends on normal struct vm_page being type stable.
+ * When processing the batch queue, vm_pageout_page_queued() must be used to
+ * determine whether the page was logically dequeued by another thread. Once
+ * this check is performed, the page lock guarantees that the page will not be
+ * disassociated from the queue.
*/
-static boolean_t
-vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
+static inline void
+vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
{
- struct vm_page marker;
struct vm_pagequeue *pq;
- boolean_t unchanged;
- u_short queue;
+ vm_page_t m, marker;
- vm_page_lock_assert(m, MA_NOTOWNED);
- if (vm_page_trylock(m))
- return (TRUE);
+ KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
+ ("marker %p not enqueued", ss->marker));
- queue = m->queue;
- vm_pageout_init_marker(&marker, queue);
- pq = vm_page_pagequeue(m);
+ marker = ss->marker;
+ pq = ss->pq;
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
- vm_pagequeue_unlock(pq);
- vm_page_lock(m);
vm_pagequeue_lock(pq);
+ for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
+ ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
+ m = TAILQ_NEXT(m, plinks.q), ss->scanned++) {
+ if ((m->flags & PG_MARKER) == 0) {
+ KASSERT((m->aflags & PGA_ENQUEUED) != 0,
+ ("page %p not enqueued", m));
+ KASSERT((m->flags & PG_FICTITIOUS) == 0,
+ ("Fictitious page %p cannot be in page queue", m));
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("Unmanaged page %p cannot be in page queue", m));
+ } else if (dequeue)
+ continue;
+
+ (void)vm_batchqueue_insert(&ss->bq, m);
+ if (dequeue) {
+ TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+ vm_page_aflag_clear(m, PGA_ENQUEUED);
+ }
+ }
+ TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
+ if (__predict_true(m != NULL))
+ TAILQ_INSERT_BEFORE(m, marker, plinks.q);
+ else
+ TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
+ if (dequeue)
+ vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
+ vm_pagequeue_unlock(pq);
+}
+
+/* XXX */
+static inline vm_page_t
+vm_pageout_next(struct scan_state *ss, const bool dequeue)
+{
+ vm_page_t m;
- /* Page queue might have changed. */
- *next = TAILQ_NEXT(&marker, plinks.q);
- unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
- KASSERT(!unchanged || m->queue == queue,
- ("page %p queue %d %d", m, queue, m->queue));
- TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
- return (unchanged);
+ m = vm_batchqueue_pop(&ss->bq);
+ if (m == NULL) {
+ vm_pageout_collect_batch(ss, dequeue);
+ m = vm_batchqueue_pop(&ss->bq);
+ }
+ return (m);
}
/*
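
Taken together, these helpers let the page daemon pull batches of pages out from under the queue lock and then examine them with only page locks held, using vm_pageout_page_queued() to detect pages that were logically dequeued in the meantime. A minimal driver sketch follows; it is illustrative only, and everything except the helpers named above is hypothetical.

/*
 * Illustrative scan skeleton only; the real consumers are
 * vm_pageout_launder() and vm_pageout_scan() below.
 */
static void
example_scan_queue(struct vm_domain *vmd, int queue)
{
        struct scan_state ss;
        struct vm_pagequeue *pq;
        struct mtx *mtx;
        vm_page_t m, marker;

        mtx = NULL;
        marker = &vmd->vmd_markers[queue];
        pq = &vmd->vmd_pagequeues[queue];

        vm_pagequeue_lock(pq);
        vm_pageout_init_scan(&ss, pq, marker, pq->pq_cnt);
        while ((m = vm_pageout_next(&ss, false)) != NULL) {
                if ((m->flags & PG_MARKER) != 0)
                        continue;
                vm_page_change_lock(m, &mtx);
                if (!vm_pageout_page_queued(m, queue))
                        continue;       /* dequeued while locks were dropped */
                /* Examine the page here, e.g. test references or dirtiness. */
        }
        if (mtx != NULL)
                mtx_unlock(mtx);
        vm_pagequeue_lock(pq);
        vm_pageout_end_scan(&ss);
        vm_pagequeue_unlock(pq);
}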
@@ -370,12 +437,12 @@
break;
}
vm_page_test_dirty(p);
- if (p->dirty == 0) {
+ if (p->dirty == 0 || !vm_page_in_laundry(p)) {
ib = 0;
break;
}
vm_page_lock(p);
- if (!vm_page_in_laundry(p) || vm_page_held(p)) {
+ if (vm_page_held(p)) {
vm_page_unlock(p);
ib = 0;
break;
@@ -398,10 +465,10 @@
if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
break;
vm_page_test_dirty(p);
- if (p->dirty == 0)
+ if (p->dirty == 0 || !vm_page_in_laundry(p))
break;
vm_page_lock(p);
- if (!vm_page_in_laundry(p) || vm_page_held(p)) {
+ if (vm_page_held(p)) {
vm_page_unlock(p);
break;
}
@@ -692,13 +759,18 @@
static int
vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
{
+ struct scan_state ss;
struct vm_pagequeue *pq;
+ struct mtx *mtx;
vm_object_t object;
- vm_page_t m, next;
- int act_delta, error, maxscan, numpagedout, starting_target;
+ vm_page_t m, marker;
+ int act_delta, error, numpagedout, queue, starting_target;
int vnodes_skipped;
- bool pageout_ok, queue_locked;
+ bool obj_locked, pageout_ok;
+ mtx = NULL;
+ obj_locked = false;
+ object = NULL;
starting_target = launder;
vnodes_skipped = 0;
@@ -711,61 +783,78 @@
* maxscan ensures that we don't re-examine requeued pages. Any
* additional pages written as part of a cluster are subtracted from
* maxscan since they must be taken from the laundry queue.
+ * XXX
*
* As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
* swap devices are configured.
*/
if (atomic_load_acq_int(&swapdev_enabled))
- pq = &vmd->vmd_pagequeues[PQ_UNSWAPPABLE];
+ queue = PQ_UNSWAPPABLE;
else
- pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+ queue = PQ_LAUNDRY;
+ marker = &vmd->vmd_markers[queue];
+ pq = &vmd->vmd_pagequeues[queue];
scan:
vm_pagequeue_lock(pq);
- maxscan = pq->pq_cnt;
- queue_locked = true;
- for (m = TAILQ_FIRST(&pq->pq_pl);
- m != NULL && maxscan-- > 0 && launder > 0;
- m = next) {
- vm_pagequeue_assert_locked(pq);
- KASSERT(queue_locked, ("unlocked laundry queue"));
- KASSERT(vm_page_in_laundry(m),
- ("page %p has an inconsistent queue", m));
- next = TAILQ_NEXT(m, plinks.q);
- if ((m->flags & PG_MARKER) != 0)
+ vm_pageout_init_scan(&ss, pq, marker, pq->pq_cnt);
+ while ((m = vm_pageout_next(&ss, false)) != NULL) {
+ if (__predict_false((m->flags & PG_MARKER) != 0))
continue;
- KASSERT((m->flags & PG_FICTITIOUS) == 0,
- ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
- if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
- vm_page_unlock(m);
+
+ vm_page_change_lock(m, &mtx);
+
+recheck:
+ /*
+ * The page may have been disassociated from the queue
+ * while locks were dropped.
+ */
+ if (!vm_pageout_page_queued(m, queue))
+ continue;
+
+ /*
+ * A requeue was requested, so this page gets a second
+ * chance.
+ */
+ if ((m->aflags & PGA_REQUEUE) != 0) {
+ vm_page_requeue(m);
continue;
}
+
+ /*
+ * Held pages are essentially stuck in the queue.
+ *
+ * Wired pages may not be freed. Complete their removal
+ * from the queue now to avoid needless revisits during
+ * future scans.
+ */
+ if (m->hold_count != 0)
+ continue;
if (m->wire_count != 0) {
- vm_page_dequeue_locked(m);
- vm_page_unlock(m);
+ vm_page_dequeue_lazy(m);
continue;
}
- object = m->object;
- if ((!VM_OBJECT_TRYWLOCK(object) &&
- (!vm_pageout_fallback_object_lock(m, &next) ||
- vm_page_held(m))) || vm_page_busied(m)) {
- VM_OBJECT_WUNLOCK(object);
- if (m->wire_count != 0 && vm_page_pagequeue(m) == pq)
- vm_page_dequeue_locked(m);
- vm_page_unlock(m);
- continue;
+
+ if (object != m->object) {
+ if (obj_locked) {
+ VM_OBJECT_WUNLOCK(object);
+ obj_locked = false;
+ }
+ object = m->object;
+ }
+ if (!obj_locked) {
+ if (!VM_OBJECT_TRYWLOCK(object)) {
+ mtx_unlock(mtx);
+ VM_OBJECT_WLOCK(object);
+ obj_locked = true;
+ mtx_lock(mtx);
+ goto recheck;
+ } else
+ obj_locked = true;
}
- /*
- * Unlock the laundry queue, invalidating the 'next' pointer.
- * Use a marker to remember our place in the laundry queue.
- */
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
- plinks.q);
- vm_pagequeue_unlock(pq);
- queue_locked = false;
+ if (vm_page_busied(m))
+ continue;
/*
* Invalid pages can be easily freed. They cannot be
@@ -814,9 +903,11 @@
*/
if (!in_shortfall)
launder--;
- goto drop_page;
- } else if ((object->flags & OBJ_DEAD) == 0)
- goto requeue_page;
+ continue;
+ } else if ((object->flags & OBJ_DEAD) == 0) {
+ vm_page_requeue(m);
+ continue;
+ }
}
/*
@@ -851,11 +942,8 @@
else
pageout_ok = true;
if (!pageout_ok) {
-requeue_page:
- vm_pagequeue_lock(pq);
- queue_locked = true;
- vm_page_requeue_locked(m);
- goto drop_page;
+ vm_page_requeue(m);
+ continue;
}
/*
@@ -874,28 +962,31 @@
error = vm_pageout_clean(m, &numpagedout);
if (error == 0) {
launder -= numpagedout;
- maxscan -= numpagedout - 1;
+ ss.scanned += numpagedout;
} else if (error == EDEADLK) {
pageout_lock_miss++;
vnodes_skipped++;
}
- goto relock_queue;
+ mtx = NULL;
+ obj_locked = false;
}
-drop_page:
- vm_page_unlock(m);
+ }
+ if (mtx != NULL) {
+ mtx_unlock(mtx);
+ mtx = NULL;
+ }
+ if (obj_locked) {
VM_OBJECT_WUNLOCK(object);
-relock_queue:
- if (!queue_locked) {
- vm_pagequeue_lock(pq);
- queue_locked = true;
- }
- next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
- TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
+ obj_locked = false;
}
+ vm_pagequeue_lock(pq);
+ vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
if (launder > 0 && pq == &vmd->vmd_pagequeues[PQ_UNSWAPPABLE]) {
- pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+ queue = PQ_LAUNDRY;
+ marker = &vmd->vmd_markers[queue];
+ pq = &vmd->vmd_pagequeues[queue];
goto scan;
}
@@ -951,7 +1042,6 @@
vmd = VM_DOMAIN(domain);
pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
- vm_pageout_init_marker(&vmd->vmd_laundry_marker, PQ_LAUNDRY);
shortfall = 0;
in_shortfall = false;
@@ -1091,6 +1181,68 @@
}
}
+static int
+vm_pageout_free_pages(vm_object_t object, vm_page_t m, struct mtx **mtxp)
+{
+ vm_page_t p, pp;
+ vm_pindex_t start;
+ int pcount, count;
+
+ pcount = MAX(object->iosize / PAGE_SIZE, 1);
+ count = 1;
+ if (pcount == 1) {
+ vm_page_free(m);
+ goto out;
+ }
+
+ /* Find the first page in the block. */
+ start = m->pindex - (m->pindex % pcount);
+ for (p = m; p->pindex > start && (pp = vm_page_prev(p)) != NULL;
+ p = pp);
+
+ /* Free the original page so we don't validate it twice. */
+ if (p == m)
+ p = vm_page_next(m);
+ vm_page_free(m);
+ /* Iterate through the block range and free compatible pages. */
+ for (m = p; m != NULL; m = p) {
+ /* Don't cache miss for the next page after the tail. */
+ if (m->pindex < start + pcount)
+ p = TAILQ_NEXT(m, listq);
+ else
+ p = NULL;
+ vm_page_change_lock(m, mtxp);
+ if (vm_page_held(m) || vm_page_busied(m) ||
+ m->queue != PQ_INACTIVE)
+ continue;
+ if (m->valid == 0)
+ goto free_page;
+ if ((m->aflags & PGA_REFERENCED) != 0)
+ continue;
+ if (object->ref_count != 0) {
+ if (pmap_ts_referenced(m)) {
+ vm_page_aflag_set(m, PGA_REFERENCED);
+ continue;
+ }
+ vm_page_test_dirty(m);
+ if (m->dirty == 0)
+ pmap_remove_all(m);
+ }
+ if (m->dirty) {
+ if ((object->flags & OBJ_DEAD) == 0)
+ vm_page_launder(m);
+ continue;
+ }
+free_page:
+ vm_page_free(m);
+ count++;
+ }
+out:
+ VM_CNT_ADD(v_dfree, count);
+
+ return (count);
+}
+
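
A worked example of the block computation above (illustrative only): with object->iosize = 65536 and PAGE_SIZE = 4096, pcount = 16, so for a page at pindex 37 the block starts at 37 - (37 % 16) = 32 and the loop walks forward freeing eligible, clean, inactive pages with pindexes in [32, 48), counting all of them against the page shortage in a single pass.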
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*
@@ -1103,13 +1255,16 @@
static bool
vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
{
- vm_page_t m, next;
+ struct scan_state ss;
+ struct vm_batchqueue rq;
+ struct mtx *mtx;
+ vm_page_t m, marker;
struct vm_pagequeue *pq;
vm_object_t object;
long min_scan;
- int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
- int page_shortage, scan_tick, scanned, starting_page_shortage;
- boolean_t queue_locked;
+ int act_delta, addl_page_shortage, deficit, inactq_shortage;
+ int page_shortage, scan_tick, starting_page_shortage;
+ bool obj_locked;
/*
* If we need to reclaim memory ask kernel caches to return
@@ -1150,78 +1305,82 @@
page_shortage = deficit = 0;
starting_page_shortage = page_shortage;
+ mtx = NULL;
+ obj_locked = false;
+ object = NULL;
+ vm_batchqueue_init(&rq);
+
/*
* Start scanning the inactive queue for pages that we can free. The
* scan will stop when we reach the target or we have scanned the
* entire queue. (Note that m->act_count is not used to make
* decisions for the inactive queue, only for the active queue.)
*/
+ marker = &vmd->vmd_markers[PQ_INACTIVE];
pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
- maxscan = pq->pq_cnt;
vm_pagequeue_lock(pq);
- queue_locked = TRUE;
- for (m = TAILQ_FIRST(&pq->pq_pl);
- m != NULL && maxscan-- > 0 && page_shortage > 0;
- m = next) {
- vm_pagequeue_assert_locked(pq);
- KASSERT(queue_locked, ("unlocked inactive queue"));
- KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
+ vm_pageout_init_scan(&ss, pq, marker, min(pq->pq_cnt, page_shortage));
+ while ((m = vm_pageout_next(&ss, true)) != NULL) {
+ if (__predict_false((m->flags & PG_MARKER) != 0))
+ continue;
- VM_CNT_INC(v_pdpages);
- next = TAILQ_NEXT(m, plinks.q);
+ vm_page_change_lock(m, &mtx);
+recheck:
/*
- * skip marker pages
+ * The page may have been disassociated from the queue
+ * while locks were dropped.
*/
- if (m->flags & PG_MARKER)
+ if (!vm_pageout_page_queued(m, PQ_INACTIVE)) {
+ addl_page_shortage++;
continue;
+ }
- KASSERT((m->flags & PG_FICTITIOUS) == 0,
- ("Fictitious page %p cannot be in inactive queue", m));
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("Unmanaged page %p cannot be in inactive queue", m));
+ /*
+ * A requeue was requested, so this page gets a second
+ * chance.
+ */
+ if ((m->aflags & PGA_REQUEUE) != 0)
+ goto reinsert;
/*
- * The page or object lock acquisitions fail if the
- * page was removed from the queue or moved to a
- * different position within the queue. In either
- * case, addl_page_shortage should not be incremented.
+ * Held pages are essentially stuck in the queue. So,
+ * they ought to be discounted from the inactive count.
+ * See the calculation of inactq_shortage before the
+ * loop over the active queue below.
+ *
+ * Wired pages may not be freed. Complete their removal
+ * from the queue now to avoid needless revisits during
+ * future scans.
*/
- if (!vm_pageout_page_lock(m, &next))
- goto unlock_page;
- else if (m->wire_count != 0) {
- /*
- * Wired pages may not be freed, and unwiring a queued
- * page will cause it to be requeued. Thus, remove them
- * from the queue now to avoid unnecessary revisits.
- */
- vm_page_dequeue_locked(m);
+ if (m->hold_count != 0) {
addl_page_shortage++;
- goto unlock_page;
- } else if (m->hold_count != 0) {
- /*
- * Held pages are essentially stuck in the
- * queue. So, they ought to be discounted
- * from the inactive count. See the
- * calculation of inactq_shortage before the
- * loop over the active queue below.
- */
+ goto reinsert;
+ }
+ if (m->wire_count != 0) {
addl_page_shortage++;
- goto unlock_page;
+ vm_page_dequeue_lazy(m);
+ continue;
}
- object = m->object;
- if (!VM_OBJECT_TRYWLOCK(object)) {
- if (!vm_pageout_fallback_object_lock(m, &next))
- goto unlock_object;
- else if (m->wire_count != 0) {
- vm_page_dequeue_locked(m);
- addl_page_shortage++;
- goto unlock_object;
- } else if (m->hold_count != 0) {
- addl_page_shortage++;
- goto unlock_object;
+
+ if (object != m->object) {
+ if (obj_locked) {
+ VM_OBJECT_WUNLOCK(object);
+ obj_locked = false;
}
+ object = m->object;
}
+ if (!obj_locked) {
+ if (!VM_OBJECT_TRYWLOCK(object)) {
+ mtx_unlock(mtx);
+ VM_OBJECT_WLOCK(object);
+ obj_locked = true;
+ mtx_lock(mtx);
+ goto recheck;
+ } else
+ obj_locked = true;
+ }
+
if (vm_page_busied(m)) {
/*
* Don't mess with busy pages. Leave them at
@@ -1232,26 +1391,8 @@
* inactive count.
*/
addl_page_shortage++;
-unlock_object:
- VM_OBJECT_WUNLOCK(object);
-unlock_page:
- vm_page_unlock(m);
- continue;
+ goto reinsert;
}
- KASSERT(!vm_page_held(m), ("Held page %p", m));
-
- /*
- * Dequeue the inactive page and unlock the inactive page
- * queue, invalidating the 'next' pointer. Dequeueing the
- * page here avoids a later reacquisition (and release) of
- * the inactive page queue lock when vm_page_activate(),
- * vm_page_free(), or vm_page_launder() is called. Use a
- * marker to remember our place in the inactive queue.
- */
- TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
- vm_page_dequeue_locked(m);
- vm_pagequeue_unlock(pq);
- queue_locked = FALSE;
/*
* Invalid pages can be easily freed. They cannot be
@@ -1289,14 +1430,10 @@
* queue.
*/
m->act_count += act_delta + ACT_ADVANCE;
- goto drop_page;
+ continue;
} else if ((object->flags & OBJ_DEAD) == 0) {
- vm_pagequeue_lock(pq);
- queue_locked = TRUE;
- m->queue = PQ_INACTIVE;
- TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
- vm_pagequeue_cnt_inc(pq);
- goto drop_page;
+ vm_page_aflag_set(m, PGA_REQUEUE);
+ goto reinsert;
}
}
@@ -1322,21 +1459,49 @@
*/
if (m->dirty == 0) {
free_page:
- vm_page_free(m);
- VM_CNT_INC(v_dfree);
- --page_shortage;
+ page_shortage -= vm_pageout_free_pages(object,
+ m, &mtx);
} else if ((object->flags & OBJ_DEAD) == 0)
vm_page_launder(m);
-drop_page:
- vm_page_unlock(m);
- VM_OBJECT_WUNLOCK(object);
- if (!queue_locked) {
+ continue;
+
+ /*
+ * Re-add stuck pages to the queue. We will examine them again
+ * during the next scan. If the queue state of a page has
+ * changed since it was physically removed from the page queue,
+ * don't do anything with that page.
+ */
+reinsert:
+ if (!vm_batchqueue_insert(&rq, m)) {
vm_pagequeue_lock(pq);
- queue_locked = TRUE;
+ do {
+ if (!vm_page_inactive(m) ||
+ (m->aflags & PGA_ENQUEUED) != 0)
+ continue;
+ vm_page_aflag_set(m, PGA_ENQUEUED);
+ if ((m->aflags & PGA_REQUEUE) != 0) {
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m,
+ plinks.q);
+ vm_page_aflag_clear(m, PGA_REQUEUE);
+ } else
+ TAILQ_INSERT_BEFORE(ss.marker, m,
+ plinks.q);
+ vm_pagequeue_cnt_inc(pq);
+ } while ((m = vm_batchqueue_pop(&rq)) != NULL);
+ vm_pagequeue_unlock(pq);
+ vm_batchqueue_init(&rq);
}
- next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
- TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
}
+ if (mtx != NULL) {
+ mtx_unlock(mtx);
+ mtx = NULL;
+ }
+ if (obj_locked) {
+ VM_OBJECT_WUNLOCK(object);
+ obj_locked = false;
+ }
+ vm_pagequeue_lock(pq);
+ vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
/*
@@ -1399,9 +1564,9 @@
vm_paging_target(vmd) + deficit + addl_page_shortage;
inactq_shortage *= act_scan_laundry_weight;
+ marker = &vmd->vmd_markers[PQ_ACTIVE];
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
- maxscan = pq->pq_cnt;
/*
* If we're just idle polling attempt to visit every
@@ -1414,7 +1579,7 @@
min_scan /= hz * vm_pageout_update_period;
} else
min_scan = 0;
- if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
+ if (min_scan > 0 || (inactq_shortage > 0 && pq->pq_cnt > 0))
vmd->vmd_last_active_scan = scan_tick;
/*
@@ -1422,35 +1587,40 @@
* the per-page activity counter and use it to identify deactivation
* candidates. Held pages may be deactivated.
*/
- for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
- min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
- scanned++) {
- KASSERT(m->queue == PQ_ACTIVE,
- ("vm_pageout_scan: page %p isn't active", m));
- next = TAILQ_NEXT(m, plinks.q);
- if ((m->flags & PG_MARKER) != 0)
- continue;
- KASSERT((m->flags & PG_FICTITIOUS) == 0,
- ("Fictitious page %p cannot be in active queue", m));
- KASSERT((m->oflags & VPO_UNMANAGED) == 0,
- ("Unmanaged page %p cannot be in active queue", m));
- if (!vm_pageout_page_lock(m, &next)) {
- vm_page_unlock(m);
- continue;
+act_scan:
+ TAILQ_INSERT_AFTER(&pq->pq_pl, &vmd->vmd_clock[0], marker, plinks.q);
+ vm_page_aflag_set(marker, PGA_ENQUEUED);
+ vm_pageout_init_scan(&ss, pq, marker, inactq_shortage > 0 ?
+ pq->pq_cnt : min_scan);
+ while ((m = vm_pageout_next(&ss, false)) != NULL) {
+ if (__predict_false(m == &vmd->vmd_clock[1])) {
+ vm_pagequeue_lock(pq);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
+ TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
+ plinks.q);
+ TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
+ plinks.q);
+ vm_pageout_end_scan(&ss);
+ goto act_scan;
}
+ if (__predict_false((m->flags & PG_MARKER) != 0))
+ continue;
+
+ vm_page_change_lock(m, &mtx);
/*
- * The count for page daemon pages is updated after checking
- * the page for eligibility.
+ * The page may have been disassociated from the queue
+ * while locks were dropped.
*/
- VM_CNT_INC(v_pdpages);
+ if (!vm_pageout_page_queued(m, PQ_ACTIVE))
+ continue;
/*
* Wired pages are dequeued lazily.
*/
if (m->wire_count != 0) {
- vm_page_dequeue_locked(m);
- vm_page_unlock(m);
+ vm_page_dequeue_lazy(m);
continue;
}
@@ -1494,9 +1664,6 @@
* queue depending on usage.
*/
if (m->act_count == 0) {
- /* Dequeue to avoid later lock recursion. */
- vm_page_dequeue_locked(m);
-
/*
* When not short for inactive pages, let dirty pages go
* through the inactive queue before moving to the
@@ -1528,11 +1695,19 @@
inactq_shortage--;
}
}
- } else
- vm_page_requeue_locked(m);
- vm_page_unlock(m);
+ }
}
+ if (mtx != NULL) {
+ mtx_unlock(mtx);
+ mtx = NULL;
+ }
+
+ vm_pagequeue_lock(pq);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
+ TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
+ vm_pageout_end_scan(&ss);
vm_pagequeue_unlock(pq);
+
if (pass > 0)
vm_swapout_run_idle();
return (page_shortage <= 0);
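For context on the hunks above: two marker pages, vmd_clock[0] and vmd_clock[1], are threaded through PQ_ACTIVE so that successive scans resume where the previous one stopped instead of always starting at the queue head. The standalone sketch below (illustration only: single-threaded, one hand instead of two, not kernel code) shows the same circular-scan idea using sys/queue.h:

#include <sys/queue.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	bool	marker;
	int	id;
};
TAILQ_HEAD(entq, entry);

/*
 * Scan up to 'limit' real entries, starting after the 'hand' marker and
 * wrapping to the list head at most once, then park the hand where the
 * scan stopped so the next call resumes there.
 */
static void
clock_scan(struct entq *q, struct entry *hand, int limit)
{
	struct entry *e, *next;
	bool wrapped;

	wrapped = false;
	e = TAILQ_NEXT(hand, link);
	while (limit > 0) {
		if (e == NULL) {
			if (wrapped)
				break;	/* every entry was visited once */
			wrapped = true;
			/* Rotate the hand back to the head of the queue. */
			TAILQ_REMOVE(q, hand, link);
			TAILQ_INSERT_HEAD(q, hand, link);
			e = TAILQ_NEXT(hand, link);
			continue;
		}
		next = TAILQ_NEXT(e, link);
		if (!e->marker) {
			printf("visit %d\n", e->id);	/* examine the entry */
			limit--;
		}
		e = next;
	}
	/* Park the hand so the next scan continues from here. */
	TAILQ_REMOVE(q, hand, link);
	if (e != NULL)
		TAILQ_INSERT_BEFORE(e, hand, link);
	else
		TAILQ_INSERT_TAIL(q, hand, link);
}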
@@ -1779,10 +1954,8 @@
KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
vmd->vmd_last_active_scan = ticks;
- vm_pageout_init_marker(&vmd->vmd_marker, PQ_INACTIVE);
- vm_pageout_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE);
- TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
- &vmd->vmd_inacthead, plinks.q);
+
+ vm_pageout_insert_markers(vmd);
/*
* The pageout daemon worker is never done, so loop forever.
Index: sys/vm/vm_pagequeue.h
===================================================================
--- sys/vm/vm_pagequeue.h
+++ sys/vm/vm_pagequeue.h
@@ -73,7 +73,18 @@
const char * const pq_name;
} __aligned(CACHE_LINE_SIZE);
+#ifndef VM_BATCHQUEUE_SIZE
+#define VM_BATCHQUEUE_SIZE 7
+#endif
+
+struct vm_batchqueue {
+ vm_page_t bq_pa[VM_BATCHQUEUE_SIZE];
+ int bq_cnt;
+} __aligned(CACHE_LINE_SIZE);
+
+#include <vm/uma.h>
#include <sys/pidctrl.h>
+
struct sysctl_oid;
/*
@@ -81,17 +92,18 @@
* and accounting.
*
* Lock Key:
- * f vmd_free_mtx
- * p vmd_pageout_mtx
- * d vm_domainset_lock
- * a atomic
- * c const after boot
- * q page queue lock
+ * f vmd_free_mtx
+ * p vmd_pageout_mtx
+ * d vm_domainset_lock
+ * a atomic
+ * c const after boot
+ * q page queue lock
*/
struct vm_domain {
struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
struct mtx_padalign vmd_free_mtx;
struct mtx_padalign vmd_pageout_mtx;
+ uma_zone_t vmd_pgcache; /* (c) per-domain page free cache. */
struct vmem *vmd_kernel_arena; /* (c) per-domain kva arena. */
u_int vmd_domain; /* (c) Domain number. */
u_int vmd_page_count; /* (c) Total page count. */
@@ -105,9 +117,9 @@
boolean_t vmd_oom;
int vmd_oom_seq;
int vmd_last_active_scan;
- struct vm_page vmd_laundry_marker;
- struct vm_page vmd_marker; /* marker for pagedaemon private use */
+ struct vm_page vmd_markers[PQ_COUNT];
struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
+ struct vm_page vmd_clock[2]; /* markers for active queue scan */
int vmd_pageout_wanted; /* (a, p) pageout daemon wait channel */
int vmd_pageout_pages_needed; /* (d) page daemon waiting for pages? */
@@ -143,6 +155,7 @@
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
+#define vm_pagequeue_trylock(pq) mtx_trylock(&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
#define vm_domain_free_assert_locked(n) \
@@ -153,6 +166,8 @@
mtx_lock(vm_domain_free_lockptr((d)))
#define vm_domain_free_lockptr(d) \
(&(d)->vmd_free_mtx)
+#define vm_domain_free_trylock(d) \
+ mtx_trylock(vm_domain_free_lockptr((d)))
#define vm_domain_free_unlock(d) \
mtx_unlock(vm_domain_free_lockptr((d)))
@@ -171,14 +186,39 @@
vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
{
-#ifdef notyet
vm_pagequeue_assert_locked(pq);
-#endif
pq->pq_cnt += addend;
}
#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1)
#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1)
+static inline void
+vm_batchqueue_init(struct vm_batchqueue *bq)
+{
+
+ bq->bq_cnt = 0;
+}
+
+static inline bool
+vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m)
+{
+
+ if (bq->bq_cnt < nitems(bq->bq_pa)) {
+ bq->bq_pa[bq->bq_cnt++] = m;
+ return (true);
+ }
+ return (false);
+}
+
+static inline vm_page_t
+vm_batchqueue_pop(struct vm_batchqueue *bq)
+{
+
+ if (bq->bq_cnt == 0)
+ return (NULL);
+ return (bq->bq_pa[--bq->bq_cnt]);
+}
+
void vm_domain_set(struct vm_domain *vmd);
void vm_domain_clear(struct vm_domain *vmd);
int vm_domain_allocate(struct vm_domain *vmd, int req, int npages);
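The vm_batchqueue added here is a small fixed-size stack of page pointers. A typical consumer fills it without holding the page queue lock and then flushes the whole batch under a single lock acquisition; this is the shape of the inactive-scan reinsert path earlier in the diff, which additionally maintains the PGA_ENQUEUED/PGA_REQUEUE flags and re-checks each page's queue state. A hedged sketch of that pattern (flush_batch() and requeue_pages() are hypothetical names, not part of the patch):

static void
flush_batch(struct vm_pagequeue *pq, struct vm_batchqueue *bq)
{
	vm_page_t m;

	/* One lock acquisition covers the whole batch. */
	vm_pagequeue_lock(pq);
	while ((m = vm_batchqueue_pop(bq)) != NULL) {
		TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
		vm_pagequeue_cnt_inc(pq);
	}
	vm_pagequeue_unlock(pq);
}

static void
requeue_pages(struct vm_pagequeue *pq, vm_page_t *pages, int npages)
{
	struct vm_batchqueue bq;
	int i;

	vm_batchqueue_init(&bq);
	for (i = 0; i < npages; i++) {
		if (!vm_batchqueue_insert(&bq, pages[i])) {
			/* The batch is full: drain it and retry this page. */
			flush_batch(pq, &bq);
			(void)vm_batchqueue_insert(&bq, pages[i]);
		}
	}
	flush_batch(pq, &bq);
}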
Index: sys/vm/vm_phys.h
===================================================================
--- sys/vm/vm_phys.h
+++ sys/vm/vm_phys.h
@@ -78,6 +78,7 @@
vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool,
int order);
vm_page_t vm_phys_alloc_pages(int domain, int pool, int order);
+int vm_phys_alloc_npages(int domain, int pool, vm_page_t *m, int cnt);
int vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high);
int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
vm_memattr_t memattr);
Index: sys/vm/vm_phys.c
===================================================================
--- sys/vm/vm_phys.c
+++ sys/vm/vm_phys.c
@@ -354,9 +354,9 @@
m->order = order;
if (tail)
- TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
+ TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
else
- TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
+ TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
fl[order].lcnt++;
}
@@ -364,7 +364,7 @@
vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
{
- TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
+ TAILQ_REMOVE(&fl[order].pl, m, listq);
fl[order].lcnt--;
m->order = VM_NFREEORDER;
}
@@ -624,6 +624,26 @@
return (NULL);
}
+int
+vm_phys_alloc_npages(int domain, int pool, vm_page_t *mp, int cnt)
+{
+ vm_page_t m;
+ int order, freelist;
+
+ for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
+ for (order = fls(cnt) - 1; order >= 0; order--) {
+ m = vm_phys_alloc_freelist_pages(domain, freelist,
+ pool, order);
+ if (m != NULL) {
+ *mp = m;
+ return (1 << order);
+ }
+ }
+ }
+ *mp = NULL;
+ return (0);
+}
+
/*
* Allocate a contiguous, power of two-sized set of physical pages from the
* specified free list. The free list must be specified using one of the
@@ -1176,7 +1196,7 @@
oind++) {
for (pind = 0; pind < VM_NFREEPOOL; pind++) {
fl = (*seg->free_queues)[pind];
- TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+ TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
/*
* Is the size of this allocation request
* larger than the largest block size?
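The new vm_phys_alloc_npages() above returns one power-of-two run of at most cnt pages, with the first page of the run in *mp; it tries the largest order not exceeding cnt first and falls back to smaller orders and other free lists. A caller that needs an exact count would loop until satisfied. A hedged sketch of such a caller (fill_pages() is a hypothetical helper; it assumes the caller holds the per-domain free queue lock, as with the other vm_phys allocators, and that pages within a run are consecutive vm_page structures):

static int
fill_pages(int domain, int pool, vm_page_t *ma, int npages)
{
	vm_page_t m;
	int got, i, n;

	for (n = 0; n < npages; n += got) {
		got = vm_phys_alloc_npages(domain, pool, &m, npages - n);
		if (got == 0)
			break;		/* the free lists are exhausted */
		for (i = 0; i < got; i++)
			ma[n + i] = &m[i];	/* run is contiguous */
	}
	return (n);	/* number of pages actually gathered */
}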
Index: sys/vm/vm_reserv.c
===================================================================
--- sys/vm/vm_reserv.c
+++ sys/vm/vm_reserv.c
@@ -419,7 +419,7 @@
index));
KASSERT(rv->popcnt > 0,
("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
- KASSERT(rv->domain < vm_ndomains,
+ KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
rv, rv->domain));
if (rv->popcnt == VM_LEVEL_0_NPAGES) {
@@ -531,7 +531,7 @@
("vm_reserv_populate: reserv %p is already full", rv));
KASSERT(rv->pages->psind == 0,
("vm_reserv_populate: reserv %p is already promoted", rv));
- KASSERT(rv->domain < vm_ndomains,
+ KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
("vm_reserv_populate: reserv %p's domain is corrupted %d",
rv, rv->domain));
popmap_set(rv->popmap, index);
@@ -1218,7 +1218,7 @@
vm_reserv_domain_lock(rv->domain);
KASSERT(rv->inpartpopq,
("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
- KASSERT(rv->domain < vm_ndomains,
+ KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
("vm_reserv_reclaim: reserv %p's domain is corrupted %d",
rv, rv->domain));
TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -249,6 +249,7 @@
object->un_pager.vnp.vnp_size = size;
object->un_pager.vnp.writemappings = 0;
+ object->iosize = vp->v_mount->mnt_stat.f_iosize;
object->domain.dr_policy = vnode_domainset;
object->handle = handle;
@@ -769,7 +770,7 @@
object = vp->v_object;
foff = IDX_TO_OFF(m[0]->pindex);
- bsize = vp->v_mount->mnt_stat.f_iosize;
+ bsize = object->iosize;
pagesperblock = bsize / PAGE_SIZE;
KASSERT(foff < object->un_pager.vnp.vnp_size,