sys/vm/vm_pageout.c
Context not available. | |||||
/* the kernel process "vm_pageout"*/ | /* the kernel process "vm_pageout"*/ | ||||
static void vm_pageout(void); | static void vm_pageout(void); | ||||
static void vm_pageout_init(void); | static void vm_pageout_init(void); | ||||
static int vm_pageout_clean(vm_page_t m); | static int vm_pageout_clean(vm_page_t m, int *numpagedout); | ||||
static int vm_pageout_cluster(vm_page_t m); | static int vm_pageout_cluster(vm_page_t m); | ||||
static bool vm_pageout_scan(struct vm_domain *vmd, int pass); | static bool vm_pageout_scan(struct vm_domain *vmd, int pass); | ||||
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, | static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, | ||||
Context not available. | |||||
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); | SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); | ||||
#endif | #endif | ||||
/* Pagedaemon activity rates, in subdivisions of one second. */ | |||||
#define VM_LAUNDER_RATE 10 | |||||
#define VM_INACT_SCAN_RATE 2 | |||||
int vm_pageout_deficit; /* Estimated number of pages deficit */ | int vm_pageout_deficit; /* Estimated number of pages deficit */ | ||||
u_int vm_pageout_wakeup_thresh; | u_int vm_pageout_wakeup_thresh; | ||||
Context not available. | |||||
bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ | bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ | ||||
bool vm_pages_needed; /* Are threads waiting for free pages? */ | bool vm_pages_needed; /* Are threads waiting for free pages? */ | ||||
/* Pending request for dirty page laundering. */ | |||||
static enum { | |||||
VM_LAUNDRY_IDLE, | |||||
VM_LAUNDRY_BACKGROUND, | |||||
VM_LAUNDRY_SHORTFALL | |||||
} vm_laundry_request = VM_LAUNDRY_IDLE; | |||||
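As used below, vm_pageout_scan() posts VM_LAUNDRY_BACKGROUND or VM_LAUNDRY_SHORTFALL and wakes the laundry thread; the laundry thread resets the state to VM_LAUNDRY_IDLE once its current target has been met.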
#if !defined(NO_SWAPPING) | #if !defined(NO_SWAPPING) | ||||
static int vm_pageout_req_swapout; /* XXX */ | static int vm_pageout_req_swapout; /* XXX */ | ||||
static int vm_daemon_needed; | static int vm_daemon_needed; | ||||
Context not available. | |||||
/* Allow for use by vm_pageout before vm_daemon is initialized. */ | /* Allow for use by vm_pageout before vm_daemon is initialized. */ | ||||
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); | MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); | ||||
#endif | #endif | ||||
static int vm_max_launder = 32; | |||||
static int vm_pageout_update_period; | static int vm_pageout_update_period; | ||||
static int defer_swap_pageouts; | |||||
static int disable_swap_pageouts; | static int disable_swap_pageouts; | ||||
static int lowmem_period = 10; | static int lowmem_period = 10; | ||||
static time_t lowmem_uptime; | static time_t lowmem_uptime; | ||||
Context not available. | |||||
CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, | CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, | ||||
"free page threshold for waking up the pageout daemon"); | "free page threshold for waking up the pageout daemon"); | ||||
SYSCTL_INT(_vm, OID_AUTO, max_launder, | |||||
CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); | |||||
SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, | SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, | ||||
CTLFLAG_RW, &vm_pageout_update_period, 0, | CTLFLAG_RW, &vm_pageout_update_period, 0, | ||||
"Maximum active LRU update period"); | "Maximum active LRU update period"); | ||||
Context not available. | |||||
CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); | CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); | ||||
#endif | #endif | ||||
SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, | |||||
CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); | |||||
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, | SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, | ||||
CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); | CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); | ||||
Context not available. | |||||
CTLFLAG_RW, &vm_pageout_oom_seq, 0, | CTLFLAG_RW, &vm_pageout_oom_seq, 0, | ||||
"back-to-back calls to oom detector to start OOM"); | "back-to-back calls to oom detector to start OOM"); | ||||
static int act_scan_laundry_weight = 3; | |||||
SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW, | |||||
&act_scan_laundry_weight, 0, | |||||
"weight given to clean vs. dirty pages in active queue scans"); | |||||
static u_int vm_background_launder_target; | |||||
SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW, | |||||
alc: Kostik, what does Bruce say about proper SYSCTL style? Is the CTLFLAG_RW supposed to be on the first line and the indentation of the continuation lines four spaces? (Currently, the new SYSCTL's are following the style of the existing ones in this file.)
kib: I am only sure about the 4-spaces indent and the new line before the description. I used a line break after the CTLFLAG* flags, but indeed Bruce might have said that the line break should be used before.
&vm_background_launder_target, 0, | |||||
"background laundering target, in pages"); | |||||
static u_int vm_background_launder_rate = 4096; | |||||
SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW, | |||||
&vm_background_launder_rate, 0, | |||||
"background laundering rate, in kilobytes per second"); | |||||
static u_int vm_background_launder_max = 20 * 1024; | |||||
SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW, | |||||
&vm_background_launder_max, 0, "background laundering cap, in kilobytes"); | |||||
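For a rough sense of the defaults (derived from the code below, not stated explicitly in the patch): with vm_background_launder_rate at 4096 KB/s and the laundry thread running VM_LAUNDER_RATE (10) passes per second, each background pass targets roughly 4096 / 10, i.e. about 410 KB of dirty data, while vm_background_launder_max (20 MB) caps how much a single background run may launder without observing another pagedaemon wakeup.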
#define VM_PAGEOUT_PAGE_COUNT 16 | #define VM_PAGEOUT_PAGE_COUNT 16 | ||||
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; | int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; | ||||
Context not available. | |||||
SYSCTL_INT(_vm, OID_AUTO, max_wired, | SYSCTL_INT(_vm, OID_AUTO, max_wired, | ||||
CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); | CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); | ||||
static u_int isqrt(u_int num); | |||||
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); | static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); | ||||
static int vm_pageout_launder(struct vm_domain *vmd, int launder, | |||||
bool in_shortfall); | |||||
static void vm_pageout_laundry_worker(void *arg); | |||||
#if !defined(NO_SWAPPING) | #if !defined(NO_SWAPPING) | ||||
static void vm_pageout_map_deactivate_pages(vm_map_t, long); | static void vm_pageout_map_deactivate_pages(vm_map_t, long); | ||||
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); | static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); | ||||
Context not available. | |||||
/* | /* | ||||
* We can cluster only if the page is not clean, busy, or held, and | * We can cluster only if the page is not clean, busy, or held, and | ||||
* the page is inactive. | * the page is in the laundry queue. | ||||
* | * | ||||
* During heavy mmap/modification loads the pageout | * During heavy mmap/modification loads the pageout | ||||
* daemon can really fragment the underlying file | * daemon can really fragment the underlying file | ||||
Context not available. | |||||
break; | break; | ||||
} | } | ||||
vm_page_lock(p); | vm_page_lock(p); | ||||
if (p->queue != PQ_INACTIVE || | if (!vm_page_in_laundry(p) || | ||||
p->hold_count != 0) { /* may be undergoing I/O */ | p->hold_count != 0) { /* may be undergoing I/O */ | ||||
vm_page_unlock(p); | vm_page_unlock(p); | ||||
ib = 0; | ib = 0; | ||||
Context not available. | |||||
if (p->dirty == 0) | if (p->dirty == 0) | ||||
break; | break; | ||||
vm_page_lock(p); | vm_page_lock(p); | ||||
if (p->queue != PQ_INACTIVE || | if (!vm_page_in_laundry(p) || | ||||
p->hold_count != 0) { /* may be undergoing I/O */ | p->hold_count != 0) { /* may be undergoing I/O */ | ||||
vm_page_unlock(p); | vm_page_unlock(p); | ||||
break; | break; | ||||
kib: What code is supposed to re-queue the page after flush? I see that the patch added a call to vm_page_deactivate_noreuse() for write completion in the swap pager, but I failed to find something that would re-queue pages for the local vnode pager.

markj: I think this is a valid problem in that there is no general mechanism to requeue the pages. The buffer cache takes care of this for some filesystems it seems, but I don't see how it would work for ZFS.

kib: Thank you for pointing out the buffer cache involvement there; indeed, for filesystems using the buffer cache it happens automatically. But IMO relying on the pager behaviour is wrong, and ZFS should not know such peculiarities of VM. I would expect that e.g. the completion code for vm_pageout_flush() takes care of the re-queue, esp. because it already handles it for some error cases.

markj: Even for filesystems which use the buffer cache, I don't see anything that automatically unwires the buf's pages once the write completes. vnode_pager_generic_putpages() specifies IO_VMIO, which is translated to B_RELBUF by ext2 and UFS, but it seems that this should really be enforced by generic code. Unfortunately, it doesn't seem possible for vm_pageout_flush() or even the vnode pager to specify a completion handler - VOP_PUTPAGES provides no mechanism to do so. Am I missing something?

kib: We do not need to force unwire. It is enough for pages to be queued later, when the buffer is recycled. Buffer cache size is limited to a fixed amount, so the count of pages participating in VMIO buffers and not visible to the page daemon is limited. OTOH, pages that are not queued because they were missed are effectively unswappable until the owning object is destroyed. VOP_PUTPAGES() is synchronous; moreover, the typical operation of the vnode pager marks the page clean before the buffer write is initiated. It, so to say, migrates the dirtiness from the pages to the buffer. I mean that vm_pageout_flush() could re-queue the pages after the pager returned.

markj: Forcing an unwire is not strictly necessary, but in the case of laundering, the pages have gone through LRU and are eligible for reclamation. It seems strange to let them exert pressure on the bufspace and go through the buffer cache's own LRU. I think UFS' current behaviour of specifying B_RELBUF is correct. VOP_PUTPAGES is not synchronous by default - one needs to specify VM_PAGER_PUT_SYNC, and this is only done when v_free_count < v_pageout_free_min. Even if VOP_PUTPAGES allowed one to specify an iodone handler like VOP_GETPAGES_ASYNC does, I don't see a way to implement it as a generic vnode method. vop_stdputpages() currently just calls VOP_WRITE, which also doesn't provide notifications for async writes.

kib: If you want to modify the bufcache behaviour WRT unwiring of the laundered pages, then vfs_vmio_unwire() looks like a proper place. It already tries to free pages or affect their LRU position on unwire in several cases, so one more case is not too outstanding. I tried to express that VOP_PUTPAGES() is synchronous from the VM PoV: the page is marked clean outright, even before the write is scheduled somehow in the io subsystem. It is the io level which records the need to perform the write, and e.g. with clustering allowed (async putpages in VM terms), the dirty buffer may sit on the dirty queue until the buffer or syncer daemons care about it. But from the VM's point of view, the page is clean after the successful return from vm_pager_putpages(), and sometimes even earlier. So vm_pageout_flush() can do whatever re-queuing attempts it finds suitable, after the pager call.
alc: Do we use VM_PAGER_PEND anywhere besides the swap pager?

To Kostik's point, I think that I agree. We should remove the vm_page_dequeue() calls from vm_pageout_cluster() and instead call vm_page_deactivate_noreuse() in vm_pageout_flush() when the pager returns VM_PAGER_OK. However, there is one catch. We shouldn't automatically call vm_page_deactivate_noreuse() when vm_pageout_flush() is called by msync(), or in general any caller besides the laundry thread. I think it would suffice to test whether the page is in the laundry queue, and only call vm_page_deactivate_noreuse() if it is. That way, we would also handle the case where msync() is performed on a page in the laundry queue.

Turning to Mark's point, we ought to tell the buffer cache to immediately release the buffer and perform vm_page_deactivate_noreuse() on the pages. However, I don't think that any of the existing flags that vm_pageout_flush() can pass to vm_pager_put_pages() accomplishes the latter. Am I wrong?

markj: Ok, I understand the suggestion now. I think that queuing the page using vm_page_deactivate_noreuse() if it was on the laundry queue is a reasonable policy, and we can use the B_NOREUSE flag to effect this in the buffer cache. It does indeed seem like we need to add a new VM_PAGER_PUT_* flag to signal our intent to VOP_PUTPAGES, and it also needs to be plumbed through VOP_WRITE somehow for the generic PUTPAGES implementation. If we add a new VM_PAGER_PUT_* flag, then we actually don't need to test whether the page is in the laundry queue: vm_pageout_flush() takes the pager flags as a parameter, so we can just set the flag in vm_pageout_cluster() and use that to determine where to queue. That way, msync and so on will be unaffected. VM_PAGER_PEND only appears to be set in the swap pager.

alc: > That way, msync and so on will be unaffected.

We might have a page in the laundry queue that is actually laundered by the msync(2) call. In that case, we would want the page to be moved to the inactive queue. If the page has been referenced while in the laundry queue, there shouldn't be a problem with having used vm_page_deactivate_noreuse() on the page because vm_pageout_scan() will see the reference and not reclaim the page.
alc: Here is the proposed patch:
```
Index: vm/vm_pageout.c
===================================================================
--- vm/vm_pageout.c	(revision 307753)
+++ vm/vm_pageout.c	(working copy)
@@ -405,7 +405,6 @@ vm_pageout_cluster(vm_page_t m)
 	 */
 	vm_page_assert_unbusied(m);
 	KASSERT(m->hold_count == 0, ("page %p is held", m));
-	vm_page_dequeue(m);
 	vm_page_unlock(m);
 
 	mc[vm_pageout_page_count] = pb = ps = m;
@@ -448,7 +447,6 @@ more:
 				ib = 0;
 				break;
 			}
-			vm_page_dequeue(p);
 			vm_page_unlock(p);
 			mc[--page_base] = pb = p;
 			++pageout_count;
@@ -474,7 +472,6 @@ more:
 			vm_page_unlock(p);
 			break;
 		}
-		vm_page_dequeue(p);
 		vm_page_unlock(p);
 		mc[page_base + pageout_count] = ps = p;
 		++pageout_count;
@@ -550,6 +547,10 @@ vm_pageout_flush(vm_page_t *mc, int count, int fla
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
+			vm_page_lock(mt);
+			if (vm_page_in_laundry(mt))
+				vm_page_deactivate_noreuse(mt);
+			vm_page_unlock(mt);
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
```
kib: Looks fine.

markj: Seems right to me. I can work on the corresponding buffer cache change, but that's probably not a prerequisite to merging PQ_LAUNDRY?

alc: No, I don't think it's a prerequisite.
Context not available. | |||||
("vm_pageout_flush: page %p is not write protected", mt)); | ("vm_pageout_flush: page %p is not write protected", mt)); | ||||
switch (pageout_status[i]) { | switch (pageout_status[i]) { | ||||
case VM_PAGER_OK: | case VM_PAGER_OK: | ||||
vm_page_lock(mt); | |||||
if (vm_page_in_laundry(mt)) | |||||
vm_page_deactivate_noreuse(mt); | |||||
vm_page_unlock(mt); | |||||
/* FALLTHROUGH */ | |||||
case VM_PAGER_PEND: | case VM_PAGER_PEND: | ||||
numpagedout++; | numpagedout++; | ||||
break; | break; | ||||
Context not available. | |||||
* worked. | * worked. | ||||
*/ | */ | ||||
vm_page_undirty(mt); | vm_page_undirty(mt); | ||||
vm_page_lock(mt); | |||||
vm_page_deactivate(mt); | |||||
alc: I don't think that we ever consciously chose between vm_page_deactivate() and vm_page_deactivate_noreuse() here. Using vm_page_deactivate() will preserve the contents of this "failed page" from reclamation for a little longer. Is there actually a reason to prefer that?

markj: I can't see any good reason either way. It looks like the failure modes that lead to VM_PAGER_BAD are transient (e.g. vnode is being reclaimed) and will lead to the page being freed by another mechanism.

alc: After sleeping on the question and your response, I have a slight preference for using the _noreuse option. I'm also going to condition the _noreuse call on whether the page is in the laundry, like we did for the OK and PEND cases. This way, msync() pages will remain in their current queue, unless they were in the laundry.
vm_page_unlock(mt); | |||||
break; | break; | ||||
case VM_PAGER_ERROR: | case VM_PAGER_ERROR: | ||||
case VM_PAGER_FAIL: | case VM_PAGER_FAIL: | ||||
/* | /* | ||||
* If page couldn't be paged out, then reactivate the | * If the page couldn't be paged out, then reactivate | ||||
* page so it doesn't clog the inactive list. (We | * it so that it doesn't clog the laundry and inactive | ||||
* will try paging out it again later). | * queues. (We will try paging it out again later). | ||||
*/ | */ | ||||
vm_page_lock(mt); | vm_page_lock(mt); | ||||
vm_page_activate(mt); | vm_page_activate(mt); | ||||
Context not available. | |||||
act_delta = 1; | act_delta = 1; | ||||
vm_page_aflag_clear(p, PGA_REFERENCED); | vm_page_aflag_clear(p, PGA_REFERENCED); | ||||
} | } | ||||
if (p->queue != PQ_ACTIVE && act_delta != 0) { | if (!vm_page_active(p) && act_delta != 0) { | ||||
vm_page_activate(p); | vm_page_activate(p); | ||||
p->act_count += act_delta; | p->act_count += act_delta; | ||||
} else if (p->queue == PQ_ACTIVE) { | } else if (vm_page_active(p)) { | ||||
if (act_delta == 0) { | if (act_delta == 0) { | ||||
p->act_count -= min(p->act_count, | p->act_count -= min(p->act_count, | ||||
ACT_DECLINE); | ACT_DECLINE); | ||||
Context not available. | |||||
p->act_count += ACT_ADVANCE; | p->act_count += ACT_ADVANCE; | ||||
vm_page_requeue(p); | vm_page_requeue(p); | ||||
} | } | ||||
} else if (p->queue == PQ_INACTIVE) | } else if (vm_page_inactive(p)) | ||||
pmap_remove_all(p); | pmap_remove_all(p); | ||||
vm_page_unlock(p); | vm_page_unlock(p); | ||||
} | } | ||||
Context not available. | |||||
* Returns 0 on success and an errno otherwise. | * Returns 0 on success and an errno otherwise. | ||||
*/ | */ | ||||
static int | static int | ||||
vm_pageout_clean(vm_page_t m) | vm_pageout_clean(vm_page_t m, int *numpagedout) | ||||
{ | { | ||||
struct vnode *vp; | struct vnode *vp; | ||||
struct mount *mp; | struct mount *mp; | ||||
Context not available. | |||||
* (3) reallocated to a different offset, or | * (3) reallocated to a different offset, or | ||||
* (4) cleaned. | * (4) cleaned. | ||||
*/ | */ | ||||
if (m->queue != PQ_INACTIVE || m->object != object || | if (!vm_page_in_laundry(m) || m->object != object || | ||||
m->pindex != pindex || m->dirty == 0) { | m->pindex != pindex || m->dirty == 0) { | ||||
vm_page_unlock(m); | vm_page_unlock(m); | ||||
error = ENXIO; | error = ENXIO; | ||||
Context not available. | |||||
* laundry. If it is still in the laundry, then we | * laundry. If it is still in the laundry, then we | ||||
* start the cleaning operation. | * start the cleaning operation. | ||||
*/ | */ | ||||
if (vm_pageout_cluster(m) == 0) | if ((*numpagedout = vm_pageout_cluster(m)) == 0) | ||||
error = EIO; | error = EIO; | ||||
unlock_all: | unlock_all: | ||||
Context not available. | |||||
} | } | ||||
/* | /* | ||||
* Attempt to launder the specified number of pages. | |||||
* | |||||
* Returns the number of pages successfully laundered. | |||||
*/ | |||||
static int | |||||
vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) | |||||
{ | |||||
struct vm_pagequeue *pq; | |||||
vm_object_t object; | |||||
vm_page_t m, next; | |||||
int act_delta, error, maxscan, numpagedout, starting_target; | |||||
int vnodes_skipped; | |||||
bool pageout_ok, queue_locked; | |||||
starting_target = launder; | |||||
vnodes_skipped = 0; | |||||
/* | |||||
* Scan the laundry queue for pages eligible to be laundered. We stop | |||||
* once the target number of dirty pages have been laundered, or once | |||||
* we've reached the end of the queue. A single iteration of this loop | |||||
* may cause more than one page to be laundered because of clustering. | |||||
* | |||||
* maxscan ensures that we don't re-examine requeued pages. Any | |||||
* additional pages written as part of a cluster are subtracted from | |||||
* maxscan since they must be taken from the laundry queue. | |||||
*/ | |||||
pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; | |||||
maxscan = pq->pq_cnt; | |||||
vm_pagequeue_lock(pq); | |||||
queue_locked = true; | |||||
for (m = TAILQ_FIRST(&pq->pq_pl); | |||||
m != NULL && maxscan-- > 0 && launder > 0; | |||||
m = next) { | |||||
vm_pagequeue_assert_locked(pq); | |||||
KASSERT(queue_locked, ("unlocked laundry queue")); | |||||
KASSERT(vm_page_in_laundry(m), | |||||
("page %p has an inconsistent queue", m)); | |||||
next = TAILQ_NEXT(m, plinks.q); | |||||
if ((m->flags & PG_MARKER) != 0) | |||||
continue; | |||||
KASSERT((m->flags & PG_FICTITIOUS) == 0, | |||||
("PG_FICTITIOUS page %p cannot be in laundry queue", m)); | |||||
KASSERT((m->oflags & VPO_UNMANAGED) == 0, | |||||
("VPO_UNMANAGED page %p cannot be in laundry queue", m)); | |||||
if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { | |||||
vm_page_unlock(m); | |||||
continue; | |||||
} | |||||
object = m->object; | |||||
if ((!VM_OBJECT_TRYWLOCK(object) && | |||||
(!vm_pageout_fallback_object_lock(m, &next) || | |||||
m->hold_count != 0)) || vm_page_busied(m)) { | |||||
VM_OBJECT_WUNLOCK(object); | |||||
vm_page_unlock(m); | |||||
continue; | |||||
} | |||||
/* | |||||
* Unlock the laundry queue, invalidating the 'next' pointer. | |||||
* Use a marker to remember our place in the laundry queue. | |||||
*/ | |||||
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, | |||||
plinks.q); | |||||
vm_pagequeue_unlock(pq); | |||||
queue_locked = false; | |||||
/* | |||||
* Invalid pages can be easily freed. They cannot be | |||||
* mapped; vm_page_free() asserts this. | |||||
*/ | |||||
if (m->valid == 0) | |||||
goto free_page; | |||||
/* | |||||
* If the page has been referenced and the object is not dead, | |||||
* reactivate or requeue the page depending on whether the | |||||
* object is mapped. | |||||
*/ | |||||
if ((m->aflags & PGA_REFERENCED) != 0) { | |||||
vm_page_aflag_clear(m, PGA_REFERENCED); | |||||
act_delta = 1; | |||||
} else | |||||
act_delta = 0; | |||||
if (object->ref_count != 0) | |||||
act_delta += pmap_ts_referenced(m); | |||||
else { | |||||
KASSERT(!pmap_page_is_mapped(m), | |||||
("page %p is mapped", m)); | |||||
} | |||||
if (act_delta != 0) { | |||||
if (object->ref_count != 0) { | |||||
PCPU_INC(cnt.v_reactivated); | |||||
vm_page_activate(m); | |||||
/* | |||||
* Increase the activation count if the page | |||||
* was referenced while in the laundry queue. | |||||
* This makes it less likely that the page will | |||||
* be returned prematurely to the inactive | |||||
* queue. | |||||
*/ | |||||
m->act_count += act_delta + ACT_ADVANCE; | |||||
/* | |||||
* If this was a background laundering, count | |||||
* activated pages towards our target. The | |||||
* purpose of background laundering is to ensure | |||||
* that pages are eventually cycled through the | |||||
* laundry queue, and an activation is a valid | |||||
* way out. | |||||
*/ | |||||
if (!in_shortfall) | |||||
launder--; | |||||
goto drop_page; | |||||
} else if ((object->flags & OBJ_DEAD) == 0) | |||||
goto requeue_page; | |||||
} | |||||
/* | |||||
* If the page appears to be clean at the machine-independent | |||||
* layer, then remove all of its mappings from the pmap in | |||||
* anticipation of freeing it. If, however, any of the page's | |||||
* mappings allow write access, then the page may still be | |||||
* modified until the last of those mappings are removed. | |||||
*/ | |||||
if (object->ref_count != 0) { | |||||
vm_page_test_dirty(m); | |||||
if (m->dirty == 0) | |||||
pmap_remove_all(m); | |||||
} | |||||
/* | |||||
* Clean pages are freed, and dirty pages are paged out unless | |||||
* they belong to a dead object. Requeueing dirty pages from | |||||
* dead objects is pointless, as they are being paged out and | |||||
* freed by the thread that destroyed the object. | |||||
*/ | |||||
if (m->dirty == 0) { | |||||
free_page: | |||||
vm_page_free(m); | |||||
PCPU_INC(cnt.v_dfree); | |||||
} else if ((object->flags & OBJ_DEAD) == 0) { | |||||
if (object->type != OBJT_SWAP && | |||||
object->type != OBJT_DEFAULT) | |||||
pageout_ok = true; | |||||
else if (disable_swap_pageouts) | |||||
pageout_ok = false; | |||||
else | |||||
pageout_ok = true; | |||||
if (!pageout_ok) { | |||||
requeue_page: | |||||
vm_pagequeue_lock(pq); | |||||
queue_locked = true; | |||||
vm_page_requeue_locked(m); | |||||
goto drop_page; | |||||
} | |||||
/* | |||||
* Form a cluster with adjacent, dirty pages from the | |||||
* same object, and page out that entire cluster. | |||||
* | |||||
* The adjacent, dirty pages must also be in the | |||||
* laundry. However, their mappings are not checked | |||||
* for new references. Consequently, a recently | |||||
* referenced page may be paged out. However, that | |||||
* page will not be prematurely reclaimed. After page | |||||
* out, the page will be placed in the inactive queue, | |||||
* where any new references will be detected and the | |||||
* page reactivated. | |||||
*/ | |||||
error = vm_pageout_clean(m, &numpagedout); | |||||
if (error == 0) { | |||||
launder -= numpagedout; | |||||
maxscan -= numpagedout - 1; | |||||
} else if (error == EDEADLK) { | |||||
pageout_lock_miss++; | |||||
vnodes_skipped++; | |||||
} | |||||
goto relock_queue; | |||||
} | |||||
drop_page: | |||||
vm_page_unlock(m); | |||||
VM_OBJECT_WUNLOCK(object); | |||||
relock_queue: | |||||
if (!queue_locked) { | |||||
vm_pagequeue_lock(pq); | |||||
queue_locked = true; | |||||
} | |||||
next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); | |||||
TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); | |||||
} | |||||
vm_pagequeue_unlock(pq); | |||||
/* | |||||
* Wakeup the sync daemon if we skipped a vnode in a writeable object | |||||
* and we didn't launder enough pages. | |||||
*/ | |||||
if (vnodes_skipped > 0 && launder > 0) | |||||
(void)speedup_syncer(); | |||||
return (starting_target - launder); | |||||
} | |||||
/* | |||||
* Compute the integer square root. | |||||
*/ | |||||
static u_int | |||||
isqrt(u_int num) | |||||
{ | |||||
u_int bit, root, tmp; | |||||
bit = 1u << ((NBBY * sizeof(u_int)) - 2); | |||||
while (bit > num) | |||||
bit >>= 2; | |||||
root = 0; | |||||
while (bit != 0) { | |||||
tmp = root + bit; | |||||
root >>= 1; | |||||
if (num >= tmp) { | |||||
num -= tmp; | |||||
root += bit; | |||||
} | |||||
bit >>= 2; | |||||
} | |||||
return (root); | |||||
} | |||||
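For reference, a minimal userland sketch (not part of the change; NBBY and u_int are defined locally to stand in for the kernel headers) that exercises the same bit-by-bit algorithm and confirms it returns the floor of the square root:
```
/* Standalone check of the bit-by-bit integer square root used above. */
#include <assert.h>
#include <limits.h>
#include <stdio.h>

typedef unsigned int u_int;
#define NBBY 8	/* bits per byte, normally from sys/param.h */

static u_int
isqrt(u_int num)
{
	u_int bit, root, tmp;

	bit = 1u << ((NBBY * sizeof(u_int)) - 2);	/* highest power of 4 */
	while (bit > num)
		bit >>= 2;
	root = 0;
	while (bit != 0) {
		tmp = root + bit;
		root >>= 1;
		if (num >= tmp) {
			num -= tmp;
			root += bit;
		}
		bit >>= 2;
	}
	return (root);
}

int
main(void)
{
	assert(isqrt(0) == 0);
	assert(isqrt(1) == 1);
	assert(isqrt(15) == 3);		/* floor, not rounding */
	assert(isqrt(16) == 4);
	assert(isqrt(UINT_MAX) == 65535);
	printf("isqrt(10000) = %u\n", isqrt(10000));	/* prints 100 */
	return (0);
}
```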
/* | |||||
* Perform the work of the laundry thread: periodically wake up and determine | |||||
* whether any pages need to be laundered. If so, determine the number of pages | |||||
* that need to be laundered, and launder them. | |||||
*/ | |||||
static void | |||||
vm_pageout_laundry_worker(void *arg) | |||||
{ | |||||
struct vm_domain *domain; | |||||
struct vm_pagequeue *pq; | |||||
uint64_t nclean, ndirty; | |||||
u_int last_launder, wakeups; | |||||
int domidx, last_target, launder, shortfall, shortfall_cycle, target; | |||||
bool in_shortfall; | |||||
domidx = (uintptr_t)arg; | |||||
domain = &vm_dom[domidx]; | |||||
pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; | |||||
KASSERT(domain->vmd_segs != 0, ("domain without segments")); | |||||
vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); | |||||
shortfall = 0; | |||||
in_shortfall = false; | |||||
shortfall_cycle = 0; | |||||
target = 0; | |||||
last_launder = 0; | |||||
/* | |||||
* The pageout laundry worker is never done, so loop forever. | |||||
*/ | |||||
for (;;) { | |||||
KASSERT(target >= 0, ("negative target %d", target)); | |||||
KASSERT(shortfall_cycle >= 0, | |||||
("negative cycle %d", shortfall_cycle)); | |||||
launder = 0; | |||||
wakeups = VM_METER_PCPU_CNT(v_pdwakeups); | |||||
/* | |||||
* First determine whether we need to launder pages to meet a | |||||
* shortage of free pages. | |||||
*/ | |||||
if (shortfall > 0) { | |||||
in_shortfall = true; | |||||
shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; | |||||
target = shortfall; | |||||
} else if (!in_shortfall) | |||||
goto trybackground; | |||||
else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { | |||||
/* | |||||
* We recently entered shortfall and began laundering | |||||
* pages. If we have completed that laundering run | |||||
* (and we are no longer in shortfall) or we have met | |||||
* our laundry target through other activity, then we | |||||
* can stop laundering pages. | |||||
*/ | |||||
in_shortfall = false; | |||||
target = 0; | |||||
goto trybackground; | |||||
} | |||||
last_launder = wakeups; | |||||
launder = target / shortfall_cycle--; | |||||
goto dolaundry; | |||||
/* | |||||
* There's no immediate need to launder any pages; see if we | |||||
* meet the conditions to perform background laundering: | |||||
* | |||||
* 1. The ratio of dirty to clean inactive pages exceeds the | |||||
* background laundering threshold and the pagedaemon has | |||||
* been woken up to reclaim pages since our last | |||||
* laundering, or | |||||
* 2. we haven't yet reached the target of the current | |||||
* background laundering run. | |||||
* | |||||
* The background laundering threshold is not a constant. | |||||
* Instead, it is a slowly growing function of the number of | |||||
* page daemon wakeups since the last laundering. Thus, as the | |||||
* ratio of dirty to clean inactive pages grows, the amount of | |||||
* memory pressure required to trigger laundering decreases. | |||||
*/ | |||||
trybackground: | |||||
nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; | |||||
ndirty = vm_cnt.v_laundry_count; | |||||
if (target == 0 && wakeups != last_launder && | |||||
ndirty * isqrt(wakeups - last_launder) >= nclean) { | |||||
target = vm_background_launder_target; | |||||
} | |||||
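For a concrete feel of this trigger (illustrative numbers, not from the patch): with nclean = 800,000 free plus inactive pages and ndirty = 100,000 pages in the laundry, ndirty * isqrt(wakeups - last_launder) >= nclean first holds once isqrt() reaches 8, i.e. after 64 pagedaemon wakeups without a laundering run; with ndirty = 400,000 it already holds after 4 wakeups.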
/* | |||||
* We have a non-zero background laundering target. If we've | |||||
* laundered up to our maximum without observing a page daemon | |||||
* wakeup, just stop. This is a safety belt that ensures we | |||||
* don't launder an excessive amount if memory pressure is low | |||||
* and the ratio of dirty to clean pages is large. Otherwise, | |||||
* proceed at the background laundering rate. | |||||
*/ | |||||
if (target > 0) { | |||||
if (wakeups != last_launder) { | |||||
last_launder = wakeups; | |||||
last_target = target; | |||||
} else if (last_target - target >= | |||||
vm_background_launder_max * PAGE_SIZE / 1024) { | |||||
target = 0; | |||||
} | |||||
launder = vm_background_launder_rate * PAGE_SIZE / 1024; | |||||
launder /= VM_LAUNDER_RATE; | |||||
if (launder > target) | |||||
launder = target; | |||||
} | |||||
dolaundry: | |||||
if (launder > 0) { | |||||
/* | |||||
* Because of I/O clustering, the number of laundered | |||||
* pages could exceed "target" by the maximum size of | |||||
* a cluster minus one. | |||||
*/ | |||||
target -= min(vm_pageout_launder(domain, launder, | |||||
in_shortfall), target); | |||||
pause("laundp", hz / VM_LAUNDER_RATE); | |||||
} | |||||
/* | |||||
* If we're not currently laundering pages and the page daemon | |||||
* hasn't posted a new request, sleep until the page daemon | |||||
* kicks us. | |||||
*/ | |||||
vm_pagequeue_lock(pq); | |||||
if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) | |||||
(void)mtx_sleep(&vm_laundry_request, | |||||
vm_pagequeue_lockptr(pq), PVM, "launds", 0); | |||||
/* | |||||
* If the pagedaemon has indicated that it's in shortfall, start | |||||
* a shortfall laundering unless we're already in the middle of | |||||
* one. This may preempt a background laundering. | |||||
*/ | |||||
if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && | |||||
(!in_shortfall || shortfall_cycle == 0)) { | |||||
shortfall = vm_laundry_target() + vm_pageout_deficit; | |||||
target = 0; | |||||
} else | |||||
shortfall = 0; | |||||
if (target == 0) | |||||
vm_laundry_request = VM_LAUNDRY_IDLE; | |||||
vm_pagequeue_unlock(pq); | |||||
} | |||||
} | |||||
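For a rough sense of the shortfall pacing (hypothetical numbers): if the pagedaemon reports a shortfall of 500 pages, shortfall_cycle starts at VM_LAUNDER_RATE / VM_INACT_SCAN_RATE = 5, so the first pass requests 500 / 5 = 100 pages; if each pass meets its quota and the free-page shortage persists, the following passes request 400 / 4, 300 / 3, and so on, spreading the 500-page target over five passes of roughly 100 ms each, about the half second between inactive-queue scans.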
/* | |||||
* vm_pageout_scan does the dirty work for the pageout daemon. | * vm_pageout_scan does the dirty work for the pageout daemon. | ||||
* | * | ||||
* pass 0 - Update active LRU/deactivate pages | * pass == 0: Update active LRU/deactivate pages | ||||
* pass 1 - Free inactive pages | * pass >= 1: Free inactive pages | ||||
* pass 2 - Launder dirty pages | |||||
* | * | ||||
* Returns true if pass was zero or enough pages were freed by the inactive | * Returns true if pass was zero or enough pages were freed by the inactive | ||||
* queue scan to meet the target. | * queue scan to meet the target. | ||||
Context not available. | |||||
vm_pageout_scan(struct vm_domain *vmd, int pass) | vm_pageout_scan(struct vm_domain *vmd, int pass) | ||||
{ | { | ||||
vm_page_t m, next; | vm_page_t m, next; | ||||
struct vm_pagequeue *pq; | struct vm_pagequeue *pq, *laundryq; | ||||
vm_object_t object; | vm_object_t object; | ||||
long min_scan; | long min_scan; | ||||
int act_delta, addl_page_shortage, deficit, error, inactq_shortage; | int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; | ||||
int maxlaunder, maxscan, page_shortage, scan_tick, scanned; | int page_shortage, scan_tick, scanned, starting_page_shortage; | ||||
int starting_page_shortage, vnodes_skipped; | boolean_t queue_locked; | ||||
boolean_t pageout_ok, queue_locked; | |||||
/* | /* | ||||
* If we need to reclaim memory ask kernel caches to return | * If we need to reclaim memory ask kernel caches to return | ||||
Context not available. | |||||
starting_page_shortage = page_shortage; | starting_page_shortage = page_shortage; | ||||
/* | /* | ||||
* maxlaunder limits the number of dirty pages we flush per scan. | |||||
* For most systems a smaller value (16 or 32) is more robust under | |||||
* extreme memory and disk pressure because any unnecessary writes | |||||
* to disk can result in extreme performance degredation. However, | |||||
* systems with excessive dirty pages (especially when MAP_NOSYNC is | |||||
* used) will die horribly with limited laundering. If the pageout | |||||
* daemon cannot clean enough pages in the first pass, we let it go | |||||
* all out in succeeding passes. | |||||
*/ | |||||
if ((maxlaunder = vm_max_launder) <= 1) | |||||
maxlaunder = 1; | |||||
if (pass > 1) | |||||
maxlaunder = 10000; | |||||
vnodes_skipped = 0; | |||||
/* | |||||
* Start scanning the inactive queue for pages that we can free. The | * Start scanning the inactive queue for pages that we can free. The | ||||
* scan will stop when we reach the target or we have scanned the | * scan will stop when we reach the target or we have scanned the | ||||
* entire queue. (Note that m->act_count is not used to make | * entire queue. (Note that m->act_count is not used to make | ||||
Context not available. | |||||
m = next) { | m = next) { | ||||
vm_pagequeue_assert_locked(pq); | vm_pagequeue_assert_locked(pq); | ||||
KASSERT(queue_locked, ("unlocked inactive queue")); | KASSERT(queue_locked, ("unlocked inactive queue")); | ||||
KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); | KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); | ||||
PCPU_INC(cnt.v_pdpages); | PCPU_INC(cnt.v_pdpages); | ||||
next = TAILQ_NEXT(m, plinks.q); | next = TAILQ_NEXT(m, plinks.q); | ||||
Context not available. | |||||
KASSERT(m->hold_count == 0, ("Held page %p", m)); | KASSERT(m->hold_count == 0, ("Held page %p", m)); | ||||
/* | /* | ||||
* We unlock the inactive page queue, invalidating the | * Dequeue the inactive page and unlock the inactive page | ||||
* 'next' pointer. Use our marker to remember our | * queue, invalidating the 'next' pointer. Dequeueing the | ||||
* place. | * page here avoids a later reacquisition (and release) of | ||||
* the inactive page queue lock when vm_page_activate(), | |||||
* vm_page_free(), or vm_page_launder() is called. Use a | |||||
* marker to remember our place in the inactive queue. | |||||
*/ | */ | ||||
TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); | TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); | ||||
vm_page_dequeue_locked(m); | |||||
vm_pagequeue_unlock(pq); | vm_pagequeue_unlock(pq); | ||||
queue_locked = FALSE; | queue_locked = FALSE; | ||||
Context not available. | |||||
} | } | ||||
if (act_delta != 0) { | if (act_delta != 0) { | ||||
if (object->ref_count != 0) { | if (object->ref_count != 0) { | ||||
PCPU_INC(cnt.v_reactivated); | |||||
vm_page_activate(m); | vm_page_activate(m); | ||||
/* | /* | ||||
Context not available. | |||||
*/ | */ | ||||
m->act_count += act_delta + ACT_ADVANCE; | m->act_count += act_delta + ACT_ADVANCE; | ||||
goto drop_page; | goto drop_page; | ||||
} else if ((object->flags & OBJ_DEAD) == 0) | } else if ((object->flags & OBJ_DEAD) == 0) { | ||||
goto requeue_page; | vm_pagequeue_lock(pq); | ||||
queue_locked = TRUE; | |||||
m->queue = PQ_INACTIVE; | |||||
TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); | |||||
vm_pagequeue_cnt_inc(pq); | |||||
goto drop_page; | |||||
} | |||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
pmap_remove_all(m); | pmap_remove_all(m); | ||||
} | } | ||||
/* | |||||
* Clean pages can be freed, but dirty pages must be sent back | |||||
* to the laundry, unless they belong to a dead object. | |||||
* Requeueing dirty pages from dead objects is pointless, as | |||||
* they are being paged out and freed by the thread that | |||||
* destroyed the object. | |||||
*/ | |||||
if (m->dirty == 0) { | if (m->dirty == 0) { | ||||
/* | |||||
* Clean pages can be freed. | |||||
*/ | |||||
free_page: | free_page: | ||||
vm_page_free(m); | vm_page_free(m); | ||||
PCPU_INC(cnt.v_dfree); | PCPU_INC(cnt.v_dfree); | ||||
--page_shortage; | --page_shortage; | ||||
} else if ((object->flags & OBJ_DEAD) != 0) { | } else if ((object->flags & OBJ_DEAD) == 0) | ||||
/* | vm_page_launder(m); | ||||
* Leave dirty pages from dead objects at the front of | |||||
* the queue. They are being paged out and freed by | |||||
* the thread that destroyed the object. They will | |||||
* leave the queue shortly after the scan finishes, so | |||||
* they should be discounted from the inactive count. | |||||
*/ | |||||
addl_page_shortage++; | |||||
} else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { | |||||
/* | |||||
* Dirty pages need to be paged out, but flushing | |||||
* a page is extremely expensive versus freeing | |||||
* a clean page. Rather then artificially limiting | |||||
* the number of pages we can flush, we instead give | |||||
* dirty pages extra priority on the inactive queue | |||||
* by forcing them to be cycled through the queue | |||||
* twice before being flushed, after which the | |||||
* (now clean) page will cycle through once more | |||||
* before being freed. This significantly extends | |||||
* the thrash point for a heavily loaded machine. | |||||
*/ | |||||
m->flags |= PG_WINATCFLS; | |||||
requeue_page: | |||||
vm_pagequeue_lock(pq); | |||||
queue_locked = TRUE; | |||||
vm_page_requeue_locked(m); | |||||
} else if (maxlaunder > 0) { | |||||
/* | |||||
* We always want to try to flush some dirty pages if | |||||
* we encounter them, to keep the system stable. | |||||
* Normally this number is small, but under extreme | |||||
* pressure where there are insufficient clean pages | |||||
* on the inactive queue, we may have to go all out. | |||||
*/ | |||||
if (object->type != OBJT_SWAP && | |||||
object->type != OBJT_DEFAULT) | |||||
pageout_ok = TRUE; | |||||
else if (disable_swap_pageouts) | |||||
pageout_ok = FALSE; | |||||
else if (defer_swap_pageouts) | |||||
pageout_ok = vm_page_count_min(); | |||||
else | |||||
pageout_ok = TRUE; | |||||
if (!pageout_ok) | |||||
goto requeue_page; | |||||
error = vm_pageout_clean(m); | |||||
/* | |||||
* Decrement page_shortage on success to account for | |||||
* the (future) cleaned page. Otherwise we could wind | |||||
* up laundering or cleaning too many pages. | |||||
*/ | |||||
if (error == 0) { | |||||
page_shortage--; | |||||
maxlaunder--; | |||||
} else if (error == EDEADLK) { | |||||
pageout_lock_miss++; | |||||
vnodes_skipped++; | |||||
} else if (error == EBUSY) { | |||||
addl_page_shortage++; | |||||
} | |||||
vm_page_lock_assert(m, MA_NOTOWNED); | |||||
goto relock_queue; | |||||
} | |||||
drop_page: | drop_page: | ||||
vm_page_unlock(m); | vm_page_unlock(m); | ||||
VM_OBJECT_WUNLOCK(object); | VM_OBJECT_WUNLOCK(object); | ||||
relock_queue: | |||||
if (!queue_locked) { | if (!queue_locked) { | ||||
vm_pagequeue_lock(pq); | vm_pagequeue_lock(pq); | ||||
queue_locked = TRUE; | queue_locked = TRUE; | ||||
Context not available. | |||||
} | } | ||||
vm_pagequeue_unlock(pq); | vm_pagequeue_unlock(pq); | ||||
/* | |||||
* Wake up the laundry thread so that it can perform any needed | |||||
* laundering. If we didn't meet our target, we're in shortfall and | |||||
* need to launder more aggressively. | |||||
*/ | |||||
if (vm_laundry_request == VM_LAUNDRY_IDLE && | |||||
starting_page_shortage > 0) { | |||||
laundryq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; | |||||
vm_pagequeue_lock(laundryq); | |||||
if (page_shortage > 0) { | |||||
vm_laundry_request = VM_LAUNDRY_SHORTFALL; | |||||
PCPU_INC(cnt.v_pdshortfalls); | |||||
} else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) | |||||
vm_laundry_request = VM_LAUNDRY_BACKGROUND; | |||||
wakeup(&vm_laundry_request); | |||||
vm_pagequeue_unlock(laundryq); | |||||
} | |||||
#if !defined(NO_SWAPPING) | #if !defined(NO_SWAPPING) | ||||
/* | /* | ||||
* Wakeup the swapout daemon if we didn't free the targeted number of | * Wakeup the swapout daemon if we didn't free the targeted number of | ||||
Context not available. | |||||
#endif | #endif | ||||
/* | /* | ||||
* Wakeup the sync daemon if we skipped a vnode in a writeable object | |||||
* and we didn't free enough pages. | |||||
*/ | |||||
if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target - | |||||
vm_cnt.v_free_min) | |||||
(void)speedup_syncer(); | |||||
/* | |||||
* If the inactive queue scan fails repeatedly to meet its | * If the inactive queue scan fails repeatedly to meet its | ||||
* target, kill the largest process. | * target, kill the largest process. | ||||
*/ | */ | ||||
Context not available. | |||||
/* | /* | ||||
* Compute the number of pages we want to try to move from the | * Compute the number of pages we want to try to move from the | ||||
* active queue to the inactive queue. | * active queue to either the inactive or laundry queue. | ||||
* | |||||
* When scanning active pages, we make clean pages count more heavily | |||||
* towards the page shortage than dirty pages. This is because dirty | |||||
* pages must be laundered before they can be reused and thus have less | |||||
* utility when attempting to quickly alleviate a shortage. However, | |||||
* this weighting also causes the scan to deactivate dirty pages | |||||
* more aggressively, improving the effectiveness of clustering and | |||||
* ensuring that they can eventually be reused. | |||||
*/ | */ | ||||
inactq_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count + | inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + | ||||
vm_cnt.v_laundry_count / act_scan_laundry_weight) + | |||||
vm_paging_target() + deficit + addl_page_shortage; | vm_paging_target() + deficit + addl_page_shortage; | ||||
page_shortage *= act_scan_laundry_weight; | |||||
pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; | pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; | ||||
vm_pagequeue_lock(pq); | vm_pagequeue_lock(pq); | ||||
Context not available. | |||||
m->act_count -= min(m->act_count, ACT_DECLINE); | m->act_count -= min(m->act_count, ACT_DECLINE); | ||||
/* | /* | ||||
* Move this page to the tail of the active or inactive | * Move this page to the tail of the active, inactive or laundry | ||||
* queue depending on usage. | * queue depending on usage. | ||||
*/ | */ | ||||
if (m->act_count == 0) { | if (m->act_count == 0) { | ||||
/* Dequeue to avoid later lock recursion. */ | /* Dequeue to avoid later lock recursion. */ | ||||
vm_page_dequeue_locked(m); | vm_page_dequeue_locked(m); | ||||
vm_page_deactivate(m); | |||||
inactq_shortage--; | /* | ||||
* When not short for inactive pages, let dirty pages go | |||||
* through the inactive queue before moving to the | |||||
* laundry queues. This gives them some extra time to | |||||
* be reactivated, potentially avoiding an expensive | |||||
* pageout. During a page shortage, the inactive queue | |||||
* is necessarily small, so we may move dirty pages | |||||
* directly to the laundry queue. | |||||
*/ | |||||
if (inactq_shortage <= 0) | |||||
vm_page_deactivate(m); | |||||
else { | |||||
/* | |||||
* Calling vm_page_test_dirty() here would | |||||
* require acquisition of the object's write | |||||
* lock. However, during a page shortage, | |||||
* directing dirty pages into the laundry | |||||
* queue is only an optimization and not a | |||||
* requirement. Therefore, we simply rely on | |||||
* the opportunistic updates to the page's | |||||
* dirty field by the pmap. | |||||
*/ | |||||
if (m->dirty == 0) { | |||||
vm_page_deactivate(m); | |||||
inactq_shortage -= | |||||
act_scan_laundry_weight; | |||||
} else { | |||||
vm_page_launder(m); | |||||
inactq_shortage--; | |||||
} | |||||
} | |||||
} else | } else | ||||
vm_page_requeue_locked(m); | vm_page_requeue_locked(m); | ||||
vm_page_unlock(m); | vm_page_unlock(m); | ||||
Context not available. | |||||
* thread during the previous scan, which must have | * thread during the previous scan, which must have | ||||
* been a level 0 scan, or vm_pageout_wanted was | * been a level 0 scan, or vm_pageout_wanted was | ||||
* already set and the scan failed to free enough | * already set and the scan failed to free enough | ||||
* pages. If we haven't yet performed a level >= 2 | * pages. If we haven't yet performed a level >= 1 | ||||
* scan (unlimited dirty cleaning), then upgrade the | * (page reclamation) scan, then increase the level | ||||
* level and scan again now. Otherwise, sleep a bit | * and scan again now. Otherwise, sleep a bit and | ||||
* and try again later. | * try again later. | ||||
*/ | */ | ||||
mtx_unlock(&vm_page_queue_free_mtx); | mtx_unlock(&vm_page_queue_free_mtx); | ||||
if (pass > 1) | if (pass >= 1) | ||||
pause("psleep", hz / 2); | pause("psleep", hz / VM_INACT_SCAN_RATE); | ||||
pass++; | pass++; | ||||
} else { | } else { | ||||
/* | /* | ||||
alc: Should the "2" here be "VM_INACT_SCAN_INTERVAL"?

markj: Yes. That name doesn't really make sense though - it's a rate. How about:
```
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index d09dccb..c996797 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -155,11 +155,9 @@ static struct kproc_desc vm_kp = {
 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
 #endif
 
-/* Sleep intervals for pagedaemon threads, in subdivisions of one second. */
-#define VM_LAUNDER_INTERVAL 10
-#define VM_INACT_SCAN_INTERVAL 2
-
-#define VM_LAUNDER_RATE (VM_LAUNDER_INTERVAL / VM_INACT_SCAN_INTERVAL)
+/* Pagedaemon activity rates, in subdivisions of one second. */
+#define VM_LAUNDER_RATE 10
+#define VM_INACT_SCAN_RATE 2
 
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
 u_int vm_pageout_wakeup_thresh;
@@ -1149,7 +1147,7 @@ vm_pageout_laundry_worker(void *arg)
 		 */
 		if (shortfall > 0) {
 			in_shortfall = true;
-			shortfall_cycle = VM_LAUNDER_RATE;
+			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
 			target = shortfall;
 		} else if (!in_shortfall)
 			goto trybackground;
@@ -1211,7 +1209,7 @@ trybackground:
 			target = 0;
 		}
 		launder = vm_background_launder_rate * PAGE_SIZE / 1024;
-		launder /= VM_LAUNDER_INTERVAL;
+		launder /= VM_LAUNDER_RATE;
 		if (launder > target)
 			launder = target;
 	}
@@ -1225,7 +1223,7 @@ dolaundry:
 		 */
 		target -= min(vm_pageout_launder(domain, launder,
 		    in_shortfall), target);
-		pause("laundp", hz / VM_LAUNDER_INTERVAL);
+		pause("laundp", hz / VM_LAUNDER_RATE);
 	}
 
 	/*
@@ -2001,7 +1999,7 @@ vm_pageout_worker(void *arg)
 		 */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		if (pass >= 1)
-			pause("psleep", hz / 2);
+			pause("psleep", hz / VM_INACT_SCAN_RATE);
 		pass++;
 	} else {
 		/*
```

alc: I agree. Commit your proposed change.
Context not available. | |||||
/* XXX does not really belong here */ | /* XXX does not really belong here */ | ||||
if (vm_page_max_wired == 0) | if (vm_page_max_wired == 0) | ||||
vm_page_max_wired = vm_cnt.v_free_count / 3; | vm_page_max_wired = vm_cnt.v_free_count / 3; | ||||
/* | |||||
* Target amount of memory to move out of the laundry queue during a | |||||
* background laundering. This is proportional to the amount of system | |||||
* memory. | |||||
*/ | |||||
vm_background_launder_target = (vm_cnt.v_free_target - | |||||
vm_cnt.v_free_min) / 10; | |||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
#endif | #endif | ||||
swap_pager_swap_init(); | swap_pager_swap_init(); | ||||
error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, | |||||
0, 0, "laundry: dom0"); | |||||
if (error != 0) | |||||
panic("starting laundry for domain 0, error %d", error); | |||||
#ifdef VM_NUMA_ALLOC | #ifdef VM_NUMA_ALLOC | ||||
for (i = 1; i < vm_ndomains; i++) { | for (i = 1; i < vm_ndomains; i++) { | ||||
error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, | error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, | ||||
Context not available. |