Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -134,6 +134,16 @@
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
     int backward, int forward);
 
+static int vm_pfault_oom_attempts = 3;
+SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
+    &vm_pfault_oom_attempts, 0,
+    "Number of page allocation attempts in page fault handler before it triggers OOM handling");
+
+static int vm_pfault_oom_wait = 10;
+SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
+    &vm_pfault_oom_wait, 0,
+    "Number of seconds to wait for free pages before retrying the page fault handler");
+
 static inline void
 release_page(struct faultstate *fs)
 {
@@ -531,7 +541,7 @@
 	vm_pindex_t retry_pindex;
 	vm_prot_t prot, retry_prot;
 	int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
-	int locked, nera, result, rv;
+	int locked, nera, oom, result, rv;
 	u_char behavior;
 	boolean_t wired;	/* Passed by reference. */
 	bool dead, hardfault, is_first_object_locked;
@@ -542,7 +552,9 @@
 	nera = -1;
 	hardfault = false;
 
-RetryFault:;
+RetryFault:
+	oom = 0;
+RetryFault_oom:
 
 	/*
 	 * Find the backing store object and offset into it to begin the
@@ -787,7 +799,17 @@
 		}
 		if (fs.m == NULL) {
 			unlock_and_deallocate(&fs);
-			VM_WAITPFAULT;
+			if (vm_pfault_oom_attempts < 0 ||
+			    oom < vm_pfault_oom_attempts) {
+				oom++;
+				vm_waitpfault(vm_pfault_oom_wait);
+				goto RetryFault_oom;
+			}
+			if (bootverbose)
+				printf(
+	"proc %d (%s) failed to alloc page on fault, starting OOM\n",
+				    curproc->p_pid, curproc->p_comm);
+			vm_pageout_oom(VM_OOM_MEM_PF);
 			goto RetryFault;
 		}
 	}
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -2652,7 +2652,7 @@
  *	- Called in various places before memory allocations.
  */
 static void
-_vm_wait(void)
+_vm_wait(int timo)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
@@ -2663,16 +2663,16 @@
 	} else {
 		if (pageproc == NULL)
 			panic("vm_wait in early boot");
-		pagedaemon_wait(PVM, "vmwait");
+		pagedaemon_wait(PVM, "vmwait", timo);
 	}
 }
 
 void
-vm_wait(void)
+vm_wait(int timo)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
-	_vm_wait();
+	_vm_wait(timo);
 }
 
 /*
@@ -2696,7 +2696,7 @@
 	if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
 		if (object != NULL)
 			VM_OBJECT_WUNLOCK(object);
-		_vm_wait();
+		_vm_wait(0);
 		if (object != NULL)
 			VM_OBJECT_WLOCK(object);
 		if (req & VM_ALLOC_WAITOK)
@@ -2719,11 +2719,11 @@
  *	this balance without careful testing first.
  */
 void
-vm_waitpfault(void)
+vm_waitpfault(int timo)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
-	pagedaemon_wait(PUSER, "pfault");
+	pagedaemon_wait(PUSER, "pfault", timo);
 }
 
 struct vm_pagequeue *
Index: sys/vm/vm_pageout.h
===================================================================
--- sys/vm/vm_pageout.h
+++ sys/vm/vm_pageout.h
@@ -80,7 +80,8 @@
 extern bool vm_pages_needed;
 
 #define	VM_OOM_MEM	1
-#define	VM_OOM_SWAPZ	2
+#define	VM_OOM_MEM_PF	2
+#define	VM_OOM_SWAPZ	3
 
 /*
  * vm_lowmem flags.
@@ -96,12 +97,11 @@
 /*
  *	Signal pageout-daemon and wait for it.
  */
-void pagedaemon_wait(int pri, const char *wmesg);
+void pagedaemon_wait(int pri, const char *wmesg, int timo);
 void pagedaemon_wakeup(void);
-#define VM_WAIT vm_wait()
-#define VM_WAITPFAULT vm_waitpfault()
-void vm_wait(void);
-void vm_waitpfault(void);
+#define VM_WAIT vm_wait(0)
+void vm_wait(int timo);
+void vm_waitpfault(int timo);
 
 #ifdef _KERNEL
 int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -1648,6 +1648,13 @@
 	return (res);
 }
 
+static int vm_oom_ratelim_count;
+static int vm_oom_ratelim_last;
+static int vm_oom_pf_secs = 10;
+SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
+    "Minimum number of seconds between page fault initiated OOM kills");
+static struct mtx vm_oom_ratelim_mtx;
+
 void
 vm_pageout_oom(int shortage)
 {
@@ -1655,9 +1662,32 @@
 	vm_offset_t size, bigsize;
 	struct thread *td;
 	struct vmspace *vm;
+	int now;
 	bool breakout;
 
 	/*
+	 * For OOM requests originating from vm_fault(), there is a high
+	 * chance that a single large process faults simultaneously in
+	 * several threads.  Also, on an active system running many
+	 * medium-sized processes, e.g. during a buildworld, all of
+	 * them could fault almost simultaneously as well.
+	 *
+	 * To avoid killing too many processes, rate-limit the OOMs
+	 * initiated by vm_fault() time-outs on the waits for free
+	 * pages.
+	 */
+	mtx_lock(&vm_oom_ratelim_mtx);
+	now = ticks;
+	if ((u_int)(now - vm_oom_ratelim_last) >= hz * vm_oom_pf_secs) {
+		vm_oom_ratelim_last = now;
+		vm_oom_ratelim_count = 0;
+	} else if (vm_oom_ratelim_count++ > 0 && shortage == VM_OOM_MEM_PF) {
+		mtx_unlock(&vm_oom_ratelim_mtx);
+		return;
+	}
+	mtx_unlock(&vm_oom_ratelim_mtx);
+
+	/*
 	 * We keep the process bigproc locked once we find it to keep anyone
 	 * from messing with it; however, there is a possibility of
 	 * deadlock if process B is bigproc and one of its child processes
@@ -1721,7 +1751,7 @@
 			continue;
 		}
 		size = vmspace_swap_count(vm);
-		if (shortage == VM_OOM_MEM)
+		if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
 			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
 		vmspace_free(vm);
@@ -1923,6 +1953,7 @@
 	int i;
 #endif
 
+	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
 	swap_pager_swap_init();
 	error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
 	    0, 0, "laundry: dom0");
@@ -1966,7 +1997,7 @@
  *	This function returns with the free queues mutex unlocked.
  */
 void
-pagedaemon_wait(int pri, const char *wmesg)
+pagedaemon_wait(int pri, const char *wmesg, int timo)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
@@ -1983,5 +2014,5 @@
 	}
 	vm_pages_needed = true;
 	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri,
-	    wmesg, 0);
+	    wmesg, timo * hz);
 }
Index: sys/vm/vm_swapout.c
===================================================================
--- sys/vm/vm_swapout.c
+++ sys/vm/vm_swapout.c
@@ -152,6 +152,11 @@
     &swap_idle_threshold2, 0,
     "Time before a process will be swapped out");
 
+static int swapper_swapin_oom_timeout = 1;
+SYSCTL_INT(_vm, OID_AUTO, swapper_swapin_oom_timeout, CTLFLAG_RW,
+    &swapper_swapin_oom_timeout, 0,
+    "Interval in seconds for the swapper to retry faulting in killed processes during OOM");
+
 static int vm_pageout_req_swapout;	/* XXX */
 static int vm_daemon_needed;
 static struct mtx vm_daemon_mtx;
@@ -164,7 +169,7 @@
 static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void swapout_procs(int action);
 static void vm_req_vmdaemon(int req);
-static void vm_thread_swapin(struct thread *td);
+static void vm_thread_swapin(struct thread *td, bool oom_swapin);
 static void vm_thread_swapout(struct thread *td);
 
 /*
@@ -203,6 +208,8 @@
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		if (pmap_resident_count(pmap) <= desired)
 			goto unlock_return;
+		if (should_yield())
+			goto unlock_return;
 		if (vm_page_busied(p))
 			continue;
 		VM_CNT_INC(v_pdpages);
@@ -516,8 +523,10 @@
 			PRELE(p);
 		}
 		sx_sunlock(&allproc_lock);
-		if (tryagain != 0 && attempts <= 10)
+		if (tryagain != 0 && attempts <= 10) {
+			maybe_yield();
 			goto again;
+		}
 	}
 }
 
@@ -552,20 +561,18 @@
  * Bring the kernel stack for a specified thread back in.
  */
 static void
-vm_thread_swapin(struct thread *td)
+vm_thread_swapin(struct thread *td, bool oom_swapin)
 {
 	vm_object_t ksobj;
 	vm_page_t ma[KSTACK_MAX_PAGES];
-	int pages;
+	int j, a, count, pages, rv;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	VM_OBJECT_WLOCK(ksobj);
-	(void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED, ma,
-	    pages);
+	(void)vm_page_grab_pages(ksobj, 0, (oom_swapin ? VM_ALLOC_SYSTEM :
+	    VM_ALLOC_NORMAL) | VM_ALLOC_WIRED, ma, pages);
 	for (int i = 0; i < pages;) {
-		int j, a, count, rv;
-
 		vm_page_assert_xbusied(ma[i]);
 		if (ma[i]->valid == VM_PAGE_BITS_ALL) {
 			vm_page_xunbusy(ma[i]);
@@ -592,8 +599,8 @@
 	cpu_thread_swapin(td);
 }
 
-void
-faultin(struct proc *p)
+static void
+faultin1(struct proc *p, bool oom_swapin)
 {
 	struct thread *td;
 
@@ -622,7 +629,7 @@
 		 * swapped out.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td)
-			vm_thread_swapin(td);
+			vm_thread_swapin(td, oom_swapin);
 		PROC_LOCK(p);
 		swapclear(p);
 		p->p_swtick = ticks;
@@ -634,6 +641,13 @@
 	}
 }
 
+void
+faultin(struct proc *p)
+{
+
+	faultin1(p, false);
+}
+
 /*
  * This swapin algorithm attempts to swap-in processes only if there
  * is enough space for them.  Of course, if a process waits for a long
@@ -645,14 +659,38 @@
 	struct proc *p;
 	struct thread *td;
 	struct proc *pp;
-	int slptime;
-	int swtime;
-	int ppri;
-	int pri;
+	int ppri, pri, slptime, swtime;
 
 loop:
 	if (vm_page_count_min()) {
-		VM_WAIT;
+		/*
+		 * We are low on memory.  A swapped-out process might
+		 * have mapped a large portion of the system's pages
+		 * as anonymous memory.  The only way to release that
+		 * memory is to kill the process, and to do so it must
+		 * first be swapped back in.
+		 */
+		sx_slock(&allproc_lock);
+		FOREACH_PROC_IN_SYSTEM(p) {
+			PROC_LOCK(p);
+			/*
+			 * Fault the killed process back in, allocating
+			 * its kernel stack pages with higher priority.
+			 */
+			if (p->p_state == PRS_NORMAL && (p->p_flag &
+			    (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM |
+			    P_WKILLED)) == P_WKILLED) {
+				sx_sunlock(&allproc_lock);
+				faultin1(p, true);
+				PROC_UNLOCK(p);
+				goto loop;
+			}
+
+			PROC_UNLOCK(p);
+		}
+		sx_sunlock(&allproc_lock);
+
+		vm_wait(swapper_swapin_oom_timeout);
 		goto loop;
 	}
 
@@ -715,8 +753,7 @@
 	}
 
 	/*
-	 * We would like to bring someone in. (only if there is space).
-	 * [What checks the space? ]
+	 * We would like to bring someone in.
 	 */
 	faultin(p);
 	PROC_UNLOCK(p);
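
The knobs added above are exposed under the vm sysctl tree: vm.pfault_oom_attempts, vm.pfault_oom_wait and vm.oom_pf_secs are CTLFLAG_RWTUN (settable as loader tunables or at run time), while vm.swapper_swapin_oom_timeout is CTLFLAG_RW.  A negative vm.pfault_oom_attempts disables the page-fault-initiated OOM and keeps retrying the allocation indefinitely.  A minimal userland sketch (not part of the change, only meaningful on a kernel with this patch applied) reading the new values via sysctlbyname(3):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int attempts, wait_secs;
	size_t len;

	/* Number of allocation retries before vm_fault() starts OOM. */
	len = sizeof(attempts);
	if (sysctlbyname("vm.pfault_oom_attempts", &attempts, &len, NULL, 0) == 0)
		printf("vm.pfault_oom_attempts = %d\n", attempts);

	/* Seconds spent in vm_waitpfault() for each retry. */
	len = sizeof(wait_secs);
	if (sysctlbyname("vm.pfault_oom_wait", &wait_secs, &len, NULL, 0) == 0)
		printf("vm.pfault_oom_wait = %d\n", wait_secs);
	return (0);
}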