D8302.diff
Index: head/sys/sys/vmmeter.h
===================================================================
--- head/sys/sys/vmmeter.h
+++ head/sys/sys/vmmeter.h
@@ -75,9 +75,10 @@
u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */
u_int v_vnodepgsout; /* (p) vnode pager pages paged out */
u_int v_intrans; /* (p) intransit blocking page faults */
- u_int v_reactivated; /* (f) pages reactivated from free list */
+ u_int v_reactivated; /* (p) pages reactivated by the pagedaemon */
u_int v_pdwakeups; /* (p) times daemon has awaken from sleep */
u_int v_pdpages; /* (p) pages analyzed by daemon */
+ u_int v_pdshortfalls; /* (p) page reclamation shortfalls */
u_int v_tcached; /* (p) total pages cached */
u_int v_dfree; /* (p) pages freed by daemon */
@@ -96,6 +97,7 @@
u_int v_active_count; /* (q) pages active */
u_int v_inactive_target; /* (c) pages desired inactive */
u_int v_inactive_count; /* (q) pages inactive */
+ u_int v_laundry_count; /* (q) pages eligible for laundering */
u_int v_cache_count; /* (f) pages on cache queue */
u_int v_pageout_free_min; /* (c) min pages reserved for kernel */
u_int v_interrupt_free_min; /* (c) reserved pages for int code */
@@ -111,7 +113,6 @@
u_int v_vforkpages; /* (p) VM pages affected by vfork() */
u_int v_rforkpages; /* (p) VM pages affected by rfork() */
u_int v_kthreadpages; /* (p) VM pages affected by fork() by kernel */
- u_int v_spare[2];
};
#ifdef _KERNEL
@@ -184,6 +185,25 @@
vm_pageout_wakeup_thresh);
}
+/*
+ * Return the number of pages we need to launder.
+ * A positive number indicates that we have a shortfall of clean pages.
+ */
+static inline int
+vm_laundry_target(void)
+{
+
+ return (vm_paging_target());
+}
+
+/*
+ * Obtain the value of a per-CPU counter.
+ */
+#define VM_METER_PCPU_CNT(member) \
+ vm_meter_cnt(__offsetof(struct vmmeter, member))
+
+u_int vm_meter_cnt(size_t);
+
#endif
/* systemwide totals computed every five seconds */
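For reference, the VM_METER_PCPU_CNT() macro defined above is how callers read a statistic that is split between the global vmmeter and the per-CPU copies. The laundry thread added later in this diff uses it exactly this way:

    u_int wakeups;

    /* Sum the global count and every CPU's count of pagedaemon wakeups. */
    wakeups = VM_METER_PCPU_CNT(v_pdwakeups);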
Index: head/sys/vm/swap_pager.c
===================================================================
--- head/sys/vm/swap_pager.c
+++ head/sys/vm/swap_pager.c
@@ -1549,17 +1549,18 @@
* For write success, clear the dirty
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiter's up ).
+ * A page is only written to swap after a period of
+ * inactivity. Therefore, we do not expect it to be
+ * reused.
*/
KASSERT(!pmap_page_is_write_mapped(m),
("swp_pager_async_iodone: page %p is not write"
" protected", m));
vm_page_undirty(m);
+ vm_page_lock(m);
+ vm_page_deactivate_noreuse(m);
+ vm_page_unlock(m);
vm_page_sunbusy(m);
- if (vm_page_count_severe()) {
- vm_page_lock(m);
- vm_page_try_to_cache(m);
- vm_page_unlock(m);
- }
}
}
@@ -1635,12 +1636,15 @@
/*
* SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
*
- * This routine dissociates the page at the given index within a
- * swap block from its backing store, paging it in if necessary.
- * If the page is paged in, it is placed in the inactive queue,
- * since it had its backing store ripped out from under it.
- * We also attempt to swap in all other pages in the swap block,
- * we only guarantee that the one at the specified index is
+ * This routine dissociates the page at the given index within an object
+ * from its backing store, paging it in if it does not reside in memory.
+ * If the page is paged in, it is marked dirty and placed in the laundry
+ * queue. The page is marked dirty because it no longer has backing
+ * store. It is placed in the laundry queue because it has not been
+ * accessed recently. Otherwise, it would already reside in memory.
+ *
+ * We also attempt to swap in all other pages in the swap block.
+ * However, we only guarantee that the one at the specified index is
* paged in.
*
* XXX - The code to page the whole block in doesn't work, so we
@@ -1669,7 +1673,7 @@
vm_object_pip_wakeup(object);
vm_page_dirty(m);
vm_page_lock(m);
- vm_page_deactivate(m);
+ vm_page_launder(m);
vm_page_unlock(m);
vm_page_xunbusy(m);
vm_pager_page_unswapped(m);
Index: head/sys/vm/vm_fault.c
===================================================================
--- head/sys/vm/vm_fault.c
+++ head/sys/vm/vm_fault.c
@@ -290,12 +290,13 @@
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
int fault_flags, vm_page_t *m_hold)
{
- vm_prot_t prot;
- vm_object_t next_object;
struct faultstate fs;
struct vnode *vp;
+ vm_object_t next_object, retry_object;
vm_offset_t e_end, e_start;
vm_page_t m;
+ vm_pindex_t retry_pindex;
+ vm_prot_t prot, retry_prot;
int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
int locked, map_generation, nera, result, rv;
u_char behavior;
@@ -946,10 +947,6 @@
* lookup.
*/
if (!fs.lookup_still_valid) {
- vm_object_t retry_object;
- vm_pindex_t retry_pindex;
- vm_prot_t retry_prot;
-
if (!vm_map_trylock_read(fs.map)) {
release_page(&fs);
unlock_and_deallocate(&fs);
Index: head/sys/vm/vm_meter.c
===================================================================
--- head/sys/vm/vm_meter.c
+++ head/sys/vm/vm_meter.c
@@ -216,29 +216,37 @@
}
/*
- * vcnt() - accumulate statistics from all cpus and the global cnt
- * structure.
+ * vm_meter_cnt() - accumulate statistics from all cpus and the global cnt
+ * structure.
*
* The vmmeter structure is now per-cpu as well as global. Those
* statistics which can be kept on a per-cpu basis (to avoid cache
* stalls between cpus) can be moved to the per-cpu vmmeter. Remaining
* statistics, such as v_free_reserved, are left in the global
* structure.
- *
- * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
*/
-static int
-vcnt(SYSCTL_HANDLER_ARGS)
+u_int
+vm_meter_cnt(size_t offset)
{
- int count = *(int *)arg1;
- int offset = (char *)arg1 - (char *)&vm_cnt;
+ struct pcpu *pcpu;
+ u_int count;
int i;
+ count = *(u_int *)((char *)&vm_cnt + offset);
CPU_FOREACH(i) {
- struct pcpu *pcpu = pcpu_find(i);
- count += *(int *)((char *)&pcpu->pc_cnt + offset);
+ pcpu = pcpu_find(i);
+ count += *(u_int *)((char *)&pcpu->pc_cnt + offset);
}
- return (SYSCTL_OUT(req, &count, sizeof(int)));
+ return (count);
+}
+
+static int
+cnt_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ u_int count;
+
+ count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt);
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
}
SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
@@ -253,8 +261,8 @@
#define VM_STATS(parent, var, descr) \
SYSCTL_PROC(parent, OID_AUTO, var, \
- CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, vcnt, \
- "IU", descr)
+ CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, \
+ cnt_sysctl, "IU", descr)
#define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr)
#define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr)
@@ -278,9 +286,10 @@
VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in");
VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out");
VM_STATS_VM(v_intrans, "In transit page faults");
-VM_STATS_VM(v_reactivated, "Pages reactivated from free list");
+VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon");
VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups");
VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon");
+VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls");
VM_STATS_VM(v_tcached, "Total pages cached");
VM_STATS_VM(v_dfree, "Pages freed by pagedaemon");
VM_STATS_VM(v_pfree, "Pages freed by exiting processes");
@@ -295,6 +304,7 @@
VM_STATS_VM(v_active_count, "Active pages");
VM_STATS_VM(v_inactive_target, "Desired inactive pages");
VM_STATS_VM(v_inactive_count, "Inactive pages");
+VM_STATS_VM(v_laundry_count, "Pages eligible for laundering");
VM_STATS_VM(v_cache_count, "Pages on cache queue");
VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
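The offset arithmetic in vm_meter_cnt() above is easier to see in a standalone form. The sketch below uses hypothetical names (toy_meter, toy_global, toy_pcpu, NTOYCPU) and a fixed CPU count; it only illustrates the pattern of summing one member, identified by its byte offset, across the global structure and all per-CPU copies, not FreeBSD's actual per-CPU machinery:

    #include <stddef.h>

    #define NTOYCPU 4

    struct toy_meter {
        unsigned int m_wakeups;     /* e.g. pagedaemon wakeups */
        unsigned int m_shortfalls;  /* e.g. reclamation shortfalls */
    };

    static struct toy_meter toy_global;          /* global counters */
    static struct toy_meter toy_pcpu[NTOYCPU];   /* per-CPU counters */

    /* Sum one member, identified by its byte offset, across all copies. */
    static unsigned int
    toy_meter_cnt(size_t offset)
    {
        unsigned int count;
        int i;

        count = *(unsigned int *)((char *)&toy_global + offset);
        for (i = 0; i < NTOYCPU; i++)
            count += *(unsigned int *)((char *)&toy_pcpu[i] + offset);
        return (count);
    }

    #define TOY_METER_CNT(member) \
        toy_meter_cnt(offsetof(struct toy_meter, member))

A caller would then write TOY_METER_CNT(m_wakeups), which parallels VM_METER_PCPU_CNT(v_pdwakeups) in the kernel.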
Index: head/sys/vm/vm_object.c
===================================================================
--- head/sys/vm/vm_object.c
+++ head/sys/vm/vm_object.c
@@ -2329,9 +2329,9 @@
* sysctl is only meant to give an
* approximation of the system anyway.
*/
- if (m->queue == PQ_ACTIVE)
+ if (vm_page_active(m))
kvo.kvo_active++;
- else if (m->queue == PQ_INACTIVE)
+ else if (vm_page_inactive(m))
kvo.kvo_inactive++;
}
Index: head/sys/vm/vm_page.h
===================================================================
--- head/sys/vm/vm_page.h
+++ head/sys/vm/vm_page.h
@@ -206,7 +206,8 @@
#define PQ_NONE 255
#define PQ_INACTIVE 0
#define PQ_ACTIVE 1
-#define PQ_COUNT 2
+#define PQ_LAUNDRY 2
+#define PQ_COUNT 3
TAILQ_HEAD(pglist, vm_page);
SLIST_HEAD(spglist, vm_page);
@@ -228,6 +229,7 @@
boolean_t vmd_oom;
int vmd_oom_seq;
int vmd_last_active_scan;
+ struct vm_page vmd_laundry_marker;
struct vm_page vmd_marker; /* marker for pagedaemon private use */
struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
};
@@ -236,6 +238,7 @@
#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED)
#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex)
+#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex)
#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex)
#ifdef _KERNEL
@@ -327,7 +330,6 @@
#define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */
#define PG_ZERO 0x0008 /* page is zeroed */
#define PG_MARKER 0x0010 /* special queue marker page */
-#define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */
#define PG_NODUMP 0x0080 /* don't include this page in a dump */
#define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */
@@ -451,10 +453,8 @@
vm_paddr_t boundary, vm_memattr_t memattr);
vm_page_t vm_page_alloc_freelist(int, int);
vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
-void vm_page_cache(vm_page_t);
void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
-int vm_page_try_to_cache (vm_page_t);
int vm_page_try_to_free (vm_page_t);
void vm_page_deactivate (vm_page_t);
void vm_page_deactivate_noreuse(vm_page_t);
@@ -465,6 +465,7 @@
void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
+void vm_page_launder(vm_page_t m);
vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
vm_page_t vm_page_next(vm_page_t m);
int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
@@ -697,5 +698,26 @@
(void)mret;
}
+static inline bool
+vm_page_active(vm_page_t m)
+{
+
+ return (m->queue == PQ_ACTIVE);
+}
+
+static inline bool
+vm_page_inactive(vm_page_t m)
+{
+
+ return (m->queue == PQ_INACTIVE);
+}
+
+static inline bool
+vm_page_in_laundry(vm_page_t m)
+{
+
+ return (m->queue == PQ_LAUNDRY);
+}
+
#endif /* _KERNEL */
#endif /* !_VM_PAGE_ */
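Taken together, the new queue predicates and vm_page_launder() replace the old cache-based retirement path. A minimal sketch of how a caller might retire a page it does not expect to need again soon, modeled on the vm_page_advise() and swap pager changes elsewhere in this diff (it assumes the usual locking: the page's object is locked so m->dirty is stable, and the page lock is taken around the queue operations):

    vm_page_lock(m);
    if (m->dirty == 0)
        vm_page_deactivate_noreuse(m);  /* clean: near the head of the inactive queue */
    else
        vm_page_launder(m);             /* dirty: queue it for the laundry thread */
    vm_page_unlock(m);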
Index: head/sys/vm/vm_page.c
===================================================================
--- head/sys/vm/vm_page.c
+++ head/sys/vm/vm_page.c
@@ -390,6 +390,10 @@
"vm active pagequeue";
*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
&vm_cnt.v_active_count;
+ *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
+ "vm laundry pagequeue";
+ *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
+ &vm_cnt.v_laundry_count;
vmd->vmd_page_count = 0;
vmd->vmd_free_count = 0;
vmd->vmd_segs = 0;
@@ -1730,9 +1734,7 @@
("vm_page_alloc: cached page %p is PG_ZERO", m));
KASSERT(m->valid != 0,
("vm_page_alloc: cached page %p is invalid", m));
- if (m->object == object && m->pindex == pindex)
- vm_cnt.v_reactivated++;
- else
+ if (m->object != object || m->pindex != pindex)
m->valid = 0;
m_object = m->object;
vm_page_cache_remove(m);
@@ -2254,7 +2256,7 @@
}
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
("page %p is PG_UNHOLDFREE", m));
- /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */
+ /* Don't care: PG_NODUMP, PG_ZERO. */
if (object->type != OBJT_DEFAULT &&
object->type != OBJT_SWAP &&
object->type != OBJT_VNODE)
@@ -2450,7 +2452,7 @@
}
KASSERT((m->flags & PG_UNHOLDFREE) == 0,
("page %p is PG_UNHOLDFREE", m));
- /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */
+ /* Don't care: PG_NODUMP, PG_ZERO. */
if (object->type != OBJT_DEFAULT &&
object->type != OBJT_SWAP &&
object->type != OBJT_VNODE)
@@ -2778,7 +2780,10 @@
vm_page_pagequeue(vm_page_t m)
{
- return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+ if (vm_page_in_laundry(m))
+ return (&vm_dom[0].vmd_pagequeues[m->queue]);
+ else
+ return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
}
/*
@@ -2840,7 +2845,10 @@
KASSERT(queue < PQ_COUNT,
("vm_page_enqueue: invalid queue %u request for page %p",
queue, m));
- pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
+ if (queue == PQ_LAUNDRY)
+ pq = &vm_dom[0].vmd_pagequeues[queue];
+ else
+ pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
vm_pagequeue_lock(pq);
m->queue = queue;
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
@@ -3124,11 +3132,8 @@
if (m->wire_count == 0) {
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
if ((m->oflags & VPO_UNMANAGED) == 0 &&
- m->object != NULL && queue != PQ_NONE) {
- if (queue == PQ_INACTIVE)
- m->flags &= ~PG_WINATCFLS;
+ m->object != NULL && queue != PQ_NONE)
vm_page_enqueue(queue, m);
- }
return (TRUE);
} else
return (FALSE);
@@ -3181,7 +3186,6 @@
} else {
if (queue != PQ_NONE)
vm_page_dequeue(m);
- m->flags &= ~PG_WINATCFLS;
vm_pagequeue_lock(pq);
}
m->queue = PQ_INACTIVE;
@@ -3221,24 +3225,25 @@
}
/*
- * vm_page_try_to_cache:
+ * vm_page_launder
*
- * Returns 0 on failure, 1 on success
+ * Put a page in the laundry.
*/
-int
-vm_page_try_to_cache(vm_page_t m)
+void
+vm_page_launder(vm_page_t m)
{
+ int queue;
- vm_page_lock_assert(m, MA_OWNED);
- VM_OBJECT_ASSERT_WLOCKED(m->object);
- if (m->dirty || m->hold_count || m->wire_count ||
- (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
- return (0);
- pmap_remove_all(m);
- if (m->dirty)
- return (0);
- vm_page_cache(m);
- return (1);
+ vm_page_assert_locked(m);
+ if ((queue = m->queue) != PQ_LAUNDRY) {
+ if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
+ if (queue != PQ_NONE)
+ vm_page_dequeue(m);
+ vm_page_enqueue(PQ_LAUNDRY, m);
+ } else
+ KASSERT(queue == PQ_NONE,
+ ("wired page %p is queued", m));
+ }
}
/*
@@ -3265,112 +3270,6 @@
}
/*
- * vm_page_cache
- *
- * Put the specified page onto the page cache queue (if appropriate).
- *
- * The object and page must be locked.
- */
-void
-vm_page_cache(vm_page_t m)
-{
- vm_object_t object;
- boolean_t cache_was_empty;
-
- vm_page_lock_assert(m, MA_OWNED);
- object = m->object;
- VM_OBJECT_ASSERT_WLOCKED(object);
- if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) ||
- m->hold_count || m->wire_count)
- panic("vm_page_cache: attempting to cache busy page");
- KASSERT(!pmap_page_is_mapped(m),
- ("vm_page_cache: page %p is mapped", m));
- KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
- if (m->valid == 0 || object->type == OBJT_DEFAULT ||
- (object->type == OBJT_SWAP &&
- !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
- /*
- * Hypothesis: A cache-eligible page belonging to a
- * default object or swap object but without a backing
- * store must be zero filled.
- */
- vm_page_free(m);
- return;
- }
- KASSERT((m->flags & PG_CACHED) == 0,
- ("vm_page_cache: page %p is already cached", m));
-
- /*
- * Remove the page from the paging queues.
- */
- vm_page_remque(m);
-
- /*
- * Remove the page from the object's collection of resident
- * pages.
- */
- vm_radix_remove(&object->rtree, m->pindex);
- TAILQ_REMOVE(&object->memq, m, listq);
- object->resident_page_count--;
-
- /*
- * Restore the default memory attribute to the page.
- */
- if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
- pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
-
- /*
- * Insert the page into the object's collection of cached pages
- * and the physical memory allocator's cache/free page queues.
- */
- m->flags &= ~PG_ZERO;
- mtx_lock(&vm_page_queue_free_mtx);
- cache_was_empty = vm_radix_is_empty(&object->cache);
- if (vm_radix_insert(&object->cache, m)) {
- mtx_unlock(&vm_page_queue_free_mtx);
- if (object->type == OBJT_VNODE &&
- object->resident_page_count == 0)
- vdrop(object->handle);
- m->object = NULL;
- vm_page_free(m);
- return;
- }
-
- /*
- * The above call to vm_radix_insert() could reclaim the one pre-
- * existing cached page from this object, resulting in a call to
- * vdrop().
- */
- if (!cache_was_empty)
- cache_was_empty = vm_radix_is_singleton(&object->cache);
-
- m->flags |= PG_CACHED;
- vm_cnt.v_cache_count++;
- PCPU_INC(cnt.v_tcached);
-#if VM_NRESERVLEVEL > 0
- if (!vm_reserv_free_page(m)) {
-#else
- if (TRUE) {
-#endif
- vm_phys_free_pages(m, 0);
- }
- vm_page_free_wakeup();
- mtx_unlock(&vm_page_queue_free_mtx);
-
- /*
- * Increment the vnode's hold count if this is the object's only
- * cached page. Decrement the vnode's hold count if this was
- * the object's only resident page.
- */
- if (object->type == OBJT_VNODE) {
- if (cache_was_empty && object->resident_page_count != 0)
- vhold(object->handle);
- else if (!cache_was_empty && object->resident_page_count == 0)
- vdrop(object->handle);
- }
-}
-
-/*
* vm_page_advise
*
* Deactivate or do nothing, as appropriate.
@@ -3413,11 +3312,13 @@
/*
* Place clean pages near the head of the inactive queue rather than
* the tail, thus defeating the queue's LRU operation and ensuring that
- * the page will be reused quickly. Dirty pages are given a chance to
- * cycle once through the inactive queue before becoming eligible for
- * laundering.
+ * the page will be reused quickly. Dirty pages not already in the
+ * laundry are moved there.
*/
- _vm_page_deactivate(m, m->dirty == 0);
+ if (m->dirty == 0)
+ vm_page_deactivate_noreuse(m);
+ else
+ vm_page_launder(m);
}
/*
@@ -3926,6 +3827,7 @@
db_printf("vm_cnt.v_cache_count: %d\n", vm_cnt.v_cache_count);
db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
+ db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
@@ -3940,12 +3842,14 @@
db_printf("pq_free %d pq_cache %d\n",
vm_cnt.v_free_count, vm_cnt.v_cache_count);
for (dom = 0; dom < vm_ndomains; dom++) {
- db_printf("dom %d page_cnt %d free %d pq_act %d pq_inact %d\n",
+ db_printf(
+ "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n",
dom,
vm_dom[dom].vmd_page_count,
vm_dom[dom].vmd_free_count,
vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
- vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt);
+ vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
+ vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt);
}
}
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c
+++ head/sys/vm/vm_pageout.c
@@ -119,7 +119,7 @@
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static void vm_pageout_init(void);
-static int vm_pageout_clean(vm_page_t m);
+static int vm_pageout_clean(vm_page_t m, int *numpagedout);
static int vm_pageout_cluster(vm_page_t m);
static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
@@ -154,6 +154,9 @@
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif
+/* Pagedaemon activity rates, in subdivisions of one second. */
+#define VM_LAUNDER_RATE 10
+#define VM_INACT_SCAN_RATE 2
int vm_pageout_deficit; /* Estimated number of pages deficit */
u_int vm_pageout_wakeup_thresh;
@@ -161,6 +164,13 @@
bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */
bool vm_pages_needed; /* Are threads waiting for free pages? */
+/* Pending request for dirty page laundering. */
+static enum {
+ VM_LAUNDRY_IDLE,
+ VM_LAUNDRY_BACKGROUND,
+ VM_LAUNDRY_SHORTFALL
+} vm_laundry_request = VM_LAUNDRY_IDLE;
+
#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
@@ -168,9 +178,7 @@
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
-static int vm_max_launder = 32;
static int vm_pageout_update_period;
-static int defer_swap_pageouts;
static int disable_swap_pageouts;
static int lowmem_period = 10;
static time_t lowmem_uptime;
@@ -193,9 +201,6 @@
CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
"free page threshold for waking up the pageout daemon");
-SYSCTL_INT(_vm, OID_AUTO, max_launder,
- CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
-
SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
CTLFLAG_RW, &vm_pageout_update_period, 0,
"Maximum active LRU update period");
@@ -215,9 +220,6 @@
CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif
-SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
- CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
-
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
@@ -229,6 +231,25 @@
CTLFLAG_RW, &vm_pageout_oom_seq, 0,
"back-to-back calls to oom detector to start OOM");
+static int act_scan_laundry_weight = 3;
+SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW,
+ &act_scan_laundry_weight, 0,
+ "weight given to clean vs. dirty pages in active queue scans");
+
+static u_int vm_background_launder_target;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW,
+ &vm_background_launder_target, 0,
+ "background laundering target, in pages");
+
+static u_int vm_background_launder_rate = 4096;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW,
+ &vm_background_launder_rate, 0,
+ "background laundering rate, in kilobytes per second");
+
+static u_int vm_background_launder_max = 20 * 1024;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW,
+ &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
+
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
@@ -236,7 +257,11 @@
SYSCTL_INT(_vm, OID_AUTO, max_wired,
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
+static u_int isqrt(u_int num);
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
+static int vm_pageout_launder(struct vm_domain *vmd, int launder,
+ bool in_shortfall);
+static void vm_pageout_laundry_worker(void *arg);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
@@ -387,7 +412,7 @@
/*
* We can cluster only if the page is not clean, busy, or held, and
- * the page is inactive.
+ * the page is in the laundry queue.
*
* During heavy mmap/modification loads the pageout
* daemon can really fragment the underlying file
@@ -413,7 +438,7 @@
break;
}
vm_page_lock(p);
- if (p->queue != PQ_INACTIVE ||
+ if (!vm_page_in_laundry(p) ||
p->hold_count != 0) { /* may be undergoing I/O */
vm_page_unlock(p);
ib = 0;
@@ -439,7 +464,7 @@
if (p->dirty == 0)
break;
vm_page_lock(p);
- if (p->queue != PQ_INACTIVE ||
+ if (!vm_page_in_laundry(p) ||
p->hold_count != 0) { /* may be undergoing I/O */
vm_page_unlock(p);
break;
@@ -519,23 +544,33 @@
("vm_pageout_flush: page %p is not write protected", mt));
switch (pageout_status[i]) {
case VM_PAGER_OK:
+ vm_page_lock(mt);
+ if (vm_page_in_laundry(mt))
+ vm_page_deactivate_noreuse(mt);
+ vm_page_unlock(mt);
+ /* FALLTHROUGH */
case VM_PAGER_PEND:
numpagedout++;
break;
case VM_PAGER_BAD:
/*
- * Page outside of range of object. Right now we
- * essentially lose the changes by pretending it
- * worked.
+ * The page is outside the object's range. We pretend
+ * that the page out worked and clean the page, so the
+ * changes will be lost if the page is reclaimed by
+ * the page daemon.
*/
vm_page_undirty(mt);
+ vm_page_lock(mt);
+ if (vm_page_in_laundry(mt))
+ vm_page_deactivate_noreuse(mt);
+ vm_page_unlock(mt);
break;
case VM_PAGER_ERROR:
case VM_PAGER_FAIL:
/*
- * If page couldn't be paged out, then reactivate the
- * page so it doesn't clog the inactive list. (We
- * will try paging out it again later).
+ * If the page couldn't be paged out, then reactivate
+ * it so that it doesn't clog the laundry and inactive
+ * queues. (We will try paging it out again later).
*/
vm_page_lock(mt);
vm_page_activate(mt);
@@ -617,10 +652,10 @@
act_delta = 1;
vm_page_aflag_clear(p, PGA_REFERENCED);
}
- if (p->queue != PQ_ACTIVE && act_delta != 0) {
+ if (!vm_page_active(p) && act_delta != 0) {
vm_page_activate(p);
p->act_count += act_delta;
- } else if (p->queue == PQ_ACTIVE) {
+ } else if (vm_page_active(p)) {
if (act_delta == 0) {
p->act_count -= min(p->act_count,
ACT_DECLINE);
@@ -636,7 +671,7 @@
p->act_count += ACT_ADVANCE;
vm_page_requeue(p);
}
- } else if (p->queue == PQ_INACTIVE)
+ } else if (vm_page_inactive(p))
pmap_remove_all(p);
vm_page_unlock(p);
}
@@ -739,7 +774,7 @@
* Returns 0 on success and an errno otherwise.
*/
static int
-vm_pageout_clean(vm_page_t m)
+vm_pageout_clean(vm_page_t m, int *numpagedout)
{
struct vnode *vp;
struct mount *mp;
@@ -797,7 +832,7 @@
* (3) reallocated to a different offset, or
* (4) cleaned.
*/
- if (m->queue != PQ_INACTIVE || m->object != object ||
+ if (!vm_page_in_laundry(m) || m->object != object ||
m->pindex != pindex || m->dirty == 0) {
vm_page_unlock(m);
error = ENXIO;
@@ -821,7 +856,7 @@
* laundry. If it is still in the laundry, then we
* start the cleaning operation.
*/
- if (vm_pageout_cluster(m) == 0)
+ if ((*numpagedout = vm_pageout_cluster(m)) == 0)
error = EIO;
unlock_all:
@@ -840,11 +875,390 @@
}
/*
+ * Attempt to launder the specified number of pages.
+ *
+ * Returns the number of pages successfully laundered.
+ */
+static int
+vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
+{
+ struct vm_pagequeue *pq;
+ vm_object_t object;
+ vm_page_t m, next;
+ int act_delta, error, maxscan, numpagedout, starting_target;
+ int vnodes_skipped;
+ bool pageout_ok, queue_locked;
+
+ starting_target = launder;
+ vnodes_skipped = 0;
+
+ /*
+ * Scan the laundry queue for pages eligible to be laundered. We stop
+ * once the target number of dirty pages have been laundered, or once
+ * we've reached the end of the queue. A single iteration of this loop
+ * may cause more than one page to be laundered because of clustering.
+ *
+ * maxscan ensures that we don't re-examine requeued pages. Any
+ * additional pages written as part of a cluster are subtracted from
+ * maxscan since they must be taken from the laundry queue.
+ */
+ pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+ maxscan = pq->pq_cnt;
+
+ vm_pagequeue_lock(pq);
+ queue_locked = true;
+ for (m = TAILQ_FIRST(&pq->pq_pl);
+ m != NULL && maxscan-- > 0 && launder > 0;
+ m = next) {
+ vm_pagequeue_assert_locked(pq);
+ KASSERT(queue_locked, ("unlocked laundry queue"));
+ KASSERT(vm_page_in_laundry(m),
+ ("page %p has an inconsistent queue", m));
+ next = TAILQ_NEXT(m, plinks.q);
+ if ((m->flags & PG_MARKER) != 0)
+ continue;
+ KASSERT((m->flags & PG_FICTITIOUS) == 0,
+ ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
+ KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+ ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
+ if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
+ vm_page_unlock(m);
+ continue;
+ }
+ object = m->object;
+ if ((!VM_OBJECT_TRYWLOCK(object) &&
+ (!vm_pageout_fallback_object_lock(m, &next) ||
+ m->hold_count != 0)) || vm_page_busied(m)) {
+ VM_OBJECT_WUNLOCK(object);
+ vm_page_unlock(m);
+ continue;
+ }
+
+ /*
+ * Unlock the laundry queue, invalidating the 'next' pointer.
+ * Use a marker to remember our place in the laundry queue.
+ */
+ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
+ plinks.q);
+ vm_pagequeue_unlock(pq);
+ queue_locked = false;
+
+ /*
+ * Invalid pages can be easily freed. They cannot be
+ * mapped; vm_page_free() asserts this.
+ */
+ if (m->valid == 0)
+ goto free_page;
+
+ /*
+ * If the page has been referenced and the object is not dead,
+ * reactivate or requeue the page depending on whether the
+ * object is mapped.
+ */
+ if ((m->aflags & PGA_REFERENCED) != 0) {
+ vm_page_aflag_clear(m, PGA_REFERENCED);
+ act_delta = 1;
+ } else
+ act_delta = 0;
+ if (object->ref_count != 0)
+ act_delta += pmap_ts_referenced(m);
+ else {
+ KASSERT(!pmap_page_is_mapped(m),
+ ("page %p is mapped", m));
+ }
+ if (act_delta != 0) {
+ if (object->ref_count != 0) {
+ PCPU_INC(cnt.v_reactivated);
+ vm_page_activate(m);
+
+ /*
+ * Increase the activation count if the page
+ * was referenced while in the laundry queue.
+ * This makes it less likely that the page will
+ * be returned prematurely to the inactive
+ * queue.
+ */
+ m->act_count += act_delta + ACT_ADVANCE;
+
+ /*
+ * If this was a background laundering, count
+ * activated pages towards our target. The
+ * purpose of background laundering is to ensure
+ * that pages are eventually cycled through the
+ * laundry queue, and an activation is a valid
+ * way out.
+ */
+ if (!in_shortfall)
+ launder--;
+ goto drop_page;
+ } else if ((object->flags & OBJ_DEAD) == 0)
+ goto requeue_page;
+ }
+
+ /*
+ * If the page appears to be clean at the machine-independent
+ * layer, then remove all of its mappings from the pmap in
+ * anticipation of freeing it. If, however, any of the page's
+ * mappings allow write access, then the page may still be
+ * modified until the last of those mappings are removed.
+ */
+ if (object->ref_count != 0) {
+ vm_page_test_dirty(m);
+ if (m->dirty == 0)
+ pmap_remove_all(m);
+ }
+
+ /*
+ * Clean pages are freed, and dirty pages are paged out unless
+ * they belong to a dead object. Requeueing dirty pages from
+ * dead objects is pointless, as they are being paged out and
+ * freed by the thread that destroyed the object.
+ */
+ if (m->dirty == 0) {
+free_page:
+ vm_page_free(m);
+ PCPU_INC(cnt.v_dfree);
+ } else if ((object->flags & OBJ_DEAD) == 0) {
+ if (object->type != OBJT_SWAP &&
+ object->type != OBJT_DEFAULT)
+ pageout_ok = true;
+ else if (disable_swap_pageouts)
+ pageout_ok = false;
+ else
+ pageout_ok = true;
+ if (!pageout_ok) {
+requeue_page:
+ vm_pagequeue_lock(pq);
+ queue_locked = true;
+ vm_page_requeue_locked(m);
+ goto drop_page;
+ }
+
+ /*
+ * Form a cluster with adjacent, dirty pages from the
+ * same object, and page out that entire cluster.
+ *
+ * The adjacent, dirty pages must also be in the
+ * laundry. However, their mappings are not checked
+ * for new references. Consequently, a recently
+ * referenced page may be paged out. However, that
+ * page will not be prematurely reclaimed. After page
+ * out, the page will be placed in the inactive queue,
+ * where any new references will be detected and the
+ * page reactivated.
+ */
+ error = vm_pageout_clean(m, &numpagedout);
+ if (error == 0) {
+ launder -= numpagedout;
+ maxscan -= numpagedout - 1;
+ } else if (error == EDEADLK) {
+ pageout_lock_miss++;
+ vnodes_skipped++;
+ }
+ goto relock_queue;
+ }
+drop_page:
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(object);
+relock_queue:
+ if (!queue_locked) {
+ vm_pagequeue_lock(pq);
+ queue_locked = true;
+ }
+ next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
+ TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
+ }
+ vm_pagequeue_unlock(pq);
+
+ /*
+ * Wakeup the sync daemon if we skipped a vnode in a writeable object
+ * and we didn't launder enough pages.
+ */
+ if (vnodes_skipped > 0 && launder > 0)
+ (void)speedup_syncer();
+
+ return (starting_target - launder);
+}
+
+/*
+ * Compute the integer square root.
+ */
+static u_int
+isqrt(u_int num)
+{
+ u_int bit, root, tmp;
+
+ bit = 1u << ((NBBY * sizeof(u_int)) - 2);
+ while (bit > num)
+ bit >>= 2;
+ root = 0;
+ while (bit != 0) {
+ tmp = root + bit;
+ root >>= 1;
+ if (num >= tmp) {
+ num -= tmp;
+ root += bit;
+ }
+ bit >>= 2;
+ }
+ return (root);
+}
+
+/*
+ * Perform the work of the laundry thread: periodically wake up and determine
+ * whether any pages need to be laundered. If so, determine the number of pages
+ * that need to be laundered, and launder them.
+ */
+static void
+vm_pageout_laundry_worker(void *arg)
+{
+ struct vm_domain *domain;
+ struct vm_pagequeue *pq;
+ uint64_t nclean, ndirty;
+ u_int last_launder, wakeups;
+ int domidx, last_target, launder, shortfall, shortfall_cycle, target;
+ bool in_shortfall;
+
+ domidx = (uintptr_t)arg;
+ domain = &vm_dom[domidx];
+ pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
+ KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+ vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
+
+ shortfall = 0;
+ in_shortfall = false;
+ shortfall_cycle = 0;
+ target = 0;
+ last_launder = 0;
+
+ /*
+ * The pageout laundry worker is never done, so loop forever.
+ */
+ for (;;) {
+ KASSERT(target >= 0, ("negative target %d", target));
+ KASSERT(shortfall_cycle >= 0,
+ ("negative cycle %d", shortfall_cycle));
+ launder = 0;
+ wakeups = VM_METER_PCPU_CNT(v_pdwakeups);
+
+ /*
+ * First determine whether we need to launder pages to meet a
+ * shortage of free pages.
+ */
+ if (shortfall > 0) {
+ in_shortfall = true;
+ shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
+ target = shortfall;
+ } else if (!in_shortfall)
+ goto trybackground;
+ else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
+ /*
+ * We recently entered shortfall and began laundering
+ * pages. If we have completed that laundering run
+ * (and we are no longer in shortfall) or we have met
+ * our laundry target through other activity, then we
+ * can stop laundering pages.
+ */
+ in_shortfall = false;
+ target = 0;
+ goto trybackground;
+ }
+ last_launder = wakeups;
+ launder = target / shortfall_cycle--;
+ goto dolaundry;
+
+ /*
+ * There's no immediate need to launder any pages; see if we
+ * meet the conditions to perform background laundering:
+ *
+ * 1. The ratio of dirty to clean inactive pages exceeds the
+ * background laundering threshold and the pagedaemon has
+ * been woken up to reclaim pages since our last
+ * laundering, or
+ * 2. we haven't yet reached the target of the current
+ * background laundering run.
+ *
+ * The background laundering threshold is not a constant.
+ * Instead, it is a slowly growing function of the number of
+ * page daemon wakeups since the last laundering. Thus, as the
+ * ratio of dirty to clean inactive pages grows, the amount of
+ * memory pressure required to trigger laundering decreases.
+ */
+trybackground:
+ nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
+ ndirty = vm_cnt.v_laundry_count;
+ if (target == 0 && wakeups != last_launder &&
+ ndirty * isqrt(wakeups - last_launder) >= nclean) {
+ target = vm_background_launder_target;
+ }
+
+ /*
+ * We have a non-zero background laundering target. If we've
+ * laundered up to our maximum without observing a page daemon
+ * wakeup, just stop. This is a safety belt that ensures we
+ * don't launder an excessive amount if memory pressure is low
+ * and the ratio of dirty to clean pages is large. Otherwise,
+ * proceed at the background laundering rate.
+ */
+ if (target > 0) {
+ if (wakeups != last_launder) {
+ last_launder = wakeups;
+ last_target = target;
+ } else if (last_target - target >=
+ vm_background_launder_max * PAGE_SIZE / 1024) {
+ target = 0;
+ }
+ launder = vm_background_launder_rate * PAGE_SIZE / 1024;
+ launder /= VM_LAUNDER_RATE;
+ if (launder > target)
+ launder = target;
+ }
+
+dolaundry:
+ if (launder > 0) {
+ /*
+ * Because of I/O clustering, the number of laundered
+ * pages could exceed "target" by the maximum size of
+ * a cluster minus one.
+ */
+ target -= min(vm_pageout_launder(domain, launder,
+ in_shortfall), target);
+ pause("laundp", hz / VM_LAUNDER_RATE);
+ }
+
+ /*
+ * If we're not currently laundering pages and the page daemon
+ * hasn't posted a new request, sleep until the page daemon
+ * kicks us.
+ */
+ vm_pagequeue_lock(pq);
+ if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
+ (void)mtx_sleep(&vm_laundry_request,
+ vm_pagequeue_lockptr(pq), PVM, "launds", 0);
+
+ /*
+ * If the pagedaemon has indicated that it's in shortfall, start
+ * a shortfall laundering unless we're already in the middle of
+ * one. This may preempt a background laundering.
+ */
+ if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
+ (!in_shortfall || shortfall_cycle == 0)) {
+ shortfall = vm_laundry_target() + vm_pageout_deficit;
+ target = 0;
+ } else
+ shortfall = 0;
+
+ if (target == 0)
+ vm_laundry_request = VM_LAUNDRY_IDLE;
+ vm_pagequeue_unlock(pq);
+ }
+}
+
+/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*
- * pass 0 - Update active LRU/deactivate pages
- * pass 1 - Free inactive pages
- * pass 2 - Launder dirty pages
+ * pass == 0: Update active LRU/deactivate pages
+ * pass >= 1: Free inactive pages
*
* Returns true if pass was zero or enough pages were freed by the inactive
* queue scan to meet the target.
@@ -856,10 +1270,9 @@
struct vm_pagequeue *pq;
vm_object_t object;
long min_scan;
- int act_delta, addl_page_shortage, deficit, error, inactq_shortage;
- int maxlaunder, maxscan, page_shortage, scan_tick, scanned;
- int starting_page_shortage, vnodes_skipped;
- boolean_t pageout_ok, queue_locked;
+ int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
+ int page_shortage, scan_tick, scanned, starting_page_shortage;
+ boolean_t queue_locked;
/*
* If we need to reclaim memory ask kernel caches to return
@@ -901,23 +1314,6 @@
starting_page_shortage = page_shortage;
/*
- * maxlaunder limits the number of dirty pages we flush per scan.
- * For most systems a smaller value (16 or 32) is more robust under
- * extreme memory and disk pressure because any unnecessary writes
- * to disk can result in extreme performance degredation. However,
- * systems with excessive dirty pages (especially when MAP_NOSYNC is
- * used) will die horribly with limited laundering. If the pageout
- * daemon cannot clean enough pages in the first pass, we let it go
- * all out in succeeding passes.
- */
- if ((maxlaunder = vm_max_launder) <= 1)
- maxlaunder = 1;
- if (pass > 1)
- maxlaunder = 10000;
-
- vnodes_skipped = 0;
-
- /*
* Start scanning the inactive queue for pages that we can free. The
* scan will stop when we reach the target or we have scanned the
* entire queue. (Note that m->act_count is not used to make
@@ -932,7 +1328,7 @@
m = next) {
vm_pagequeue_assert_locked(pq);
KASSERT(queue_locked, ("unlocked inactive queue"));
- KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
+ KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
PCPU_INC(cnt.v_pdpages);
next = TAILQ_NEXT(m, plinks.q);
@@ -995,11 +1391,15 @@
KASSERT(m->hold_count == 0, ("Held page %p", m));
/*
- * We unlock the inactive page queue, invalidating the
- * 'next' pointer. Use our marker to remember our
- * place.
+ * Dequeue the inactive page and unlock the inactive page
+ * queue, invalidating the 'next' pointer. Dequeueing the
+ * page here avoids a later reacquisition (and release) of
+ * the inactive page queue lock when vm_page_activate(),
+ * vm_page_free(), or vm_page_launder() is called. Use a
+ * marker to remember our place in the inactive queue.
*/
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
+ vm_page_dequeue_locked(m);
vm_pagequeue_unlock(pq);
queue_locked = FALSE;
@@ -1028,6 +1428,7 @@
}
if (act_delta != 0) {
if (object->ref_count != 0) {
+ PCPU_INC(cnt.v_reactivated);
vm_page_activate(m);
/*
@@ -1039,8 +1440,14 @@
*/
m->act_count += act_delta + ACT_ADVANCE;
goto drop_page;
- } else if ((object->flags & OBJ_DEAD) == 0)
- goto requeue_page;
+ } else if ((object->flags & OBJ_DEAD) == 0) {
+ vm_pagequeue_lock(pq);
+ queue_locked = TRUE;
+ m->queue = PQ_INACTIVE;
+ TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+ vm_pagequeue_cnt_inc(pq);
+ goto drop_page;
+ }
}
/*
@@ -1056,83 +1463,23 @@
pmap_remove_all(m);
}
+ /*
+ * Clean pages can be freed, but dirty pages must be sent back
+ * to the laundry, unless they belong to a dead object.
+ * Requeueing dirty pages from dead objects is pointless, as
+ * they are being paged out and freed by the thread that
+ * destroyed the object.
+ */
if (m->dirty == 0) {
- /*
- * Clean pages can be freed.
- */
free_page:
vm_page_free(m);
PCPU_INC(cnt.v_dfree);
--page_shortage;
- } else if ((object->flags & OBJ_DEAD) != 0) {
- /*
- * Leave dirty pages from dead objects at the front of
- * the queue. They are being paged out and freed by
- * the thread that destroyed the object. They will
- * leave the queue shortly after the scan finishes, so
- * they should be discounted from the inactive count.
- */
- addl_page_shortage++;
- } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
- /*
- * Dirty pages need to be paged out, but flushing
- * a page is extremely expensive versus freeing
- * a clean page. Rather then artificially limiting
- * the number of pages we can flush, we instead give
- * dirty pages extra priority on the inactive queue
- * by forcing them to be cycled through the queue
- * twice before being flushed, after which the
- * (now clean) page will cycle through once more
- * before being freed. This significantly extends
- * the thrash point for a heavily loaded machine.
- */
- m->flags |= PG_WINATCFLS;
-requeue_page:
- vm_pagequeue_lock(pq);
- queue_locked = TRUE;
- vm_page_requeue_locked(m);
- } else if (maxlaunder > 0) {
- /*
- * We always want to try to flush some dirty pages if
- * we encounter them, to keep the system stable.
- * Normally this number is small, but under extreme
- * pressure where there are insufficient clean pages
- * on the inactive queue, we may have to go all out.
- */
-
- if (object->type != OBJT_SWAP &&
- object->type != OBJT_DEFAULT)
- pageout_ok = TRUE;
- else if (disable_swap_pageouts)
- pageout_ok = FALSE;
- else if (defer_swap_pageouts)
- pageout_ok = vm_page_count_min();
- else
- pageout_ok = TRUE;
- if (!pageout_ok)
- goto requeue_page;
- error = vm_pageout_clean(m);
- /*
- * Decrement page_shortage on success to account for
- * the (future) cleaned page. Otherwise we could wind
- * up laundering or cleaning too many pages.
- */
- if (error == 0) {
- page_shortage--;
- maxlaunder--;
- } else if (error == EDEADLK) {
- pageout_lock_miss++;
- vnodes_skipped++;
- } else if (error == EBUSY) {
- addl_page_shortage++;
- }
- vm_page_lock_assert(m, MA_NOTOWNED);
- goto relock_queue;
- }
+ } else if ((object->flags & OBJ_DEAD) == 0)
+ vm_page_launder(m);
drop_page:
vm_page_unlock(m);
VM_OBJECT_WUNLOCK(object);
-relock_queue:
if (!queue_locked) {
vm_pagequeue_lock(pq);
queue_locked = TRUE;
@@ -1142,6 +1489,24 @@
}
vm_pagequeue_unlock(pq);
+ /*
+ * Wake up the laundry thread so that it can perform any needed
+ * laundering. If we didn't meet our target, we're in shortfall and
+ * need to launder more aggressively.
+ */
+ if (vm_laundry_request == VM_LAUNDRY_IDLE &&
+ starting_page_shortage > 0) {
+ pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
+ vm_pagequeue_lock(pq);
+ if (page_shortage > 0) {
+ vm_laundry_request = VM_LAUNDRY_SHORTFALL;
+ PCPU_INC(cnt.v_pdshortfalls);
+ } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
+ vm_laundry_request = VM_LAUNDRY_BACKGROUND;
+ wakeup(&vm_laundry_request);
+ vm_pagequeue_unlock(pq);
+ }
+
#if !defined(NO_SWAPPING)
/*
* Wakeup the swapout daemon if we didn't free the targeted number of
@@ -1152,14 +1517,6 @@
#endif
/*
- * Wakeup the sync daemon if we skipped a vnode in a writeable object
- * and we didn't free enough pages.
- */
- if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target -
- vm_cnt.v_free_min)
- (void)speedup_syncer();
-
- /*
* If the inactive queue scan fails repeatedly to meet its
* target, kill the largest process.
*/
@@ -1167,10 +1524,20 @@
/*
* Compute the number of pages we want to try to move from the
- * active queue to the inactive queue.
+ * active queue to either the inactive or laundry queue.
+ *
+ * When scanning active pages, we make clean pages count more heavily
+ * towards the page shortage than dirty pages. This is because dirty
+ * pages must be laundered before they can be reused and thus have less
+ * utility when attempting to quickly alleviate a shortage. However,
+ * this weighting also causes the scan to deactivate dirty pages more
+ * aggressively, improving the effectiveness of clustering and
+ * ensuring that they can eventually be reused.
*/
- inactq_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count +
+ inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
+ vm_cnt.v_laundry_count / act_scan_laundry_weight) +
vm_paging_target() + deficit + addl_page_shortage;
+ page_shortage *= act_scan_laundry_weight;
pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
vm_pagequeue_lock(pq);
@@ -1254,14 +1621,44 @@
m->act_count -= min(m->act_count, ACT_DECLINE);
/*
- * Move this page to the tail of the active or inactive
+ * Move this page to the tail of the active, inactive or laundry
* queue depending on usage.
*/
if (m->act_count == 0) {
/* Dequeue to avoid later lock recursion. */
vm_page_dequeue_locked(m);
- vm_page_deactivate(m);
- inactq_shortage--;
+
+ /*
+ * When not short for inactive pages, let dirty pages go
+ * through the inactive queue before moving to the
+ * laundry queues. This gives them some extra time to
+ * be reactivated, potentially avoiding an expensive
+ * pageout. During a page shortage, the inactive queue
+ * is necessarily small, so we may move dirty pages
+ * directly to the laundry queue.
+ */
+ if (inactq_shortage <= 0)
+ vm_page_deactivate(m);
+ else {
+ /*
+ * Calling vm_page_test_dirty() here would
+ * require acquisition of the object's write
+ * lock. However, during a page shortage,
+ * directing dirty pages into the laundry
+ * queue is only an optimization and not a
+ * requirement. Therefore, we simply rely on
+ * the opportunistic updates to the page's
+ * dirty field by the pmap.
+ */
+ if (m->dirty == 0) {
+ vm_page_deactivate(m);
+ inactq_shortage -=
+ act_scan_laundry_weight;
+ } else {
+ vm_page_launder(m);
+ inactq_shortage--;
+ }
+ }
} else
vm_page_requeue_locked(m);
vm_page_unlock(m);
@@ -1570,14 +1967,14 @@
* thread during the previous scan, which must have
* been a level 0 scan, or vm_pageout_wanted was
* already set and the scan failed to free enough
- * pages. If we haven't yet performed a level >= 2
- * scan (unlimited dirty cleaning), then upgrade the
- * level and scan again now. Otherwise, sleep a bit
- * and try again later.
+ * pages. If we haven't yet performed a level >= 1
+ * (page reclamation) scan, then increase the level
+ * and scan again now. Otherwise, sleep a bit and
+ * try again later.
*/
mtx_unlock(&vm_page_queue_free_mtx);
- if (pass > 1)
- pause("psleep", hz / 2);
+ if (pass >= 1)
+ pause("psleep", hz / VM_INACT_SCAN_RATE);
pass++;
} else {
/*
@@ -1648,6 +2045,14 @@
/* XXX does not really belong here */
if (vm_page_max_wired == 0)
vm_page_max_wired = vm_cnt.v_free_count / 3;
+
+ /*
+ * Target amount of memory to move out of the laundry queue during a
+ * background laundering. This is proportional to the amount of system
+ * memory.
+ */
+ vm_background_launder_target = (vm_cnt.v_free_target -
+ vm_cnt.v_free_min) / 10;
}
/*
@@ -1662,6 +2067,10 @@
#endif
swap_pager_swap_init();
+ error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
+ 0, 0, "laundry: dom0");
+ if (error != 0)
+ panic("starting laundry for domain 0, error %d", error);
#ifdef VM_NUMA_ALLOC
for (i = 1; i < vm_ndomains; i++) {
error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
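A worked example of the background-laundering trigger implemented in vm_pageout_laundry_worker() above, using the formula ndirty * isqrt(wakeups - last_launder) >= nclean with illustrative page counts (not measurements):

    nclean = v_inactive_count + v_free_count = 200,000 pages
    ndirty = v_laundry_count                 =  50,000 pages

    50,000 * isqrt(16) = 50,000 * 4 = 200,000 >= 200,000
        -> a background run starts once 16 pagedaemon wakeups have
           accumulated since the last laundering
    100,000 * isqrt(4) = 100,000 * 2 = 200,000 >= 200,000
        -> with twice as many dirty pages, only 4 wakeups are needed

In other words, the dirtier the mix of reclaimable pages, the less accumulated memory pressure is required before background laundering begins. A shortfall request, by contrast, bypasses this test: the shortfall target is laundered over VM_LAUNDER_RATE / VM_INACT_SCAN_RATE = 5 passes, with a pause of hz / VM_LAUNDER_RATE ticks between passes.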
