Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -2775,7 +2775,7 @@
 		RELEASE_PV_LIST_LOCK(lockp);
 		PMAP_UNLOCK(pmap);
 		PMAP_ASSERT_NOT_IN_DI();
-		vm_wait(NULL);
+		vm_wait(NULL, 0);
 		PMAP_LOCK(pmap);
 	}
Index: sys/compat/linuxkpi/common/src/linux_page.c
===================================================================
--- sys/compat/linuxkpi/common/src/linux_page.c
+++ sys/compat/linuxkpi/common/src/linux_page.c
@@ -101,7 +101,7 @@
 		if (flags & M_WAITOK) {
 			if (!vm_page_reclaim_contig(req, npages, 0,
 			    pmax, PAGE_SIZE, 0)) {
-				vm_wait(NULL);
+				vm_wait(NULL, 0);
 			}
 			flags &= ~M_WAITOK;
 			goto retry;
Index: sys/dev/drm2/i915/i915_gem.c
===================================================================
--- sys/dev/drm2/i915/i915_gem.c
+++ sys/dev/drm2/i915/i915_gem.c
@@ -1561,7 +1561,7 @@
 		i915_gem_object_unpin(obj);
 		DRM_UNLOCK(dev);
 		VM_OBJECT_WUNLOCK(vm_obj);
-		vm_wait(vm_obj);
+		vm_wait(vm_obj, 0);
 		goto retry;
 	}
 	page->valid = VM_PAGE_BITS_ALL;
Index: sys/dev/drm2/i915/i915_gem_gtt.c
===================================================================
--- sys/dev/drm2/i915/i915_gem_gtt.c
+++ sys/dev/drm2/i915/i915_gem_gtt.c
@@ -589,7 +589,7 @@
 	if (tries < 1) {
 		if (!vm_page_reclaim_contig(req, 1, 0, 0xffffffff,
 		    PAGE_SIZE, 0))
-			vm_wait(NULL);
+			vm_wait(NULL, 0);
 		tries++;
 		goto retry;
 	}
Index: sys/dev/drm2/ttm/ttm_bo_vm.c
===================================================================
--- sys/dev/drm2/ttm/ttm_bo_vm.c
+++ sys/dev/drm2/ttm/ttm_bo_vm.c
@@ -246,7 +246,7 @@
 	if (m1 == NULL) {
 		if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) {
 			VM_OBJECT_WUNLOCK(vm_obj);
-			vm_wait(vm_obj);
+			vm_wait(vm_obj, 0);
 			VM_OBJECT_WLOCK(vm_obj);
 			ttm_mem_io_unlock(man);
 			ttm_bo_unreserve(bo);
Index: sys/dev/drm2/ttm/ttm_page_alloc.c
===================================================================
--- sys/dev/drm2/ttm/ttm_page_alloc.c
+++ sys/dev/drm2/ttm/ttm_page_alloc.c
@@ -168,7 +168,7 @@
 			return (p);
 		if (!vm_page_reclaim_contig(req, 1, 0, 0xffffffff,
 		    PAGE_SIZE, 0))
-			vm_wait(NULL);
+			vm_wait(NULL, 0);
 	}
 }
 
@@ -181,7 +181,7 @@
 		p = vm_page_alloc(NULL, 0, req);
 		if (p != NULL)
 			break;
-		vm_wait(NULL);
+		vm_wait(NULL, 0);
 	}
 	pmap_page_set_memattr(p, memattr);
 	return (p);
Index: sys/i386/i386/pmap.c
===================================================================
--- sys/i386/i386/pmap.c
+++ sys/i386/i386/pmap.c
@@ -2034,7 +2034,7 @@
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 		if (m == NULL) {
-			vm_wait(NULL);
+			vm_wait(NULL, 0);
 		} else {
 			pmap->pm_ptdpg[i] = m;
 #if defined(PAE) || defined(PAE_TABLES)
@@ -2078,7 +2078,7 @@
 		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
 			PMAP_UNLOCK(pmap);
 			rw_wunlock(&pvh_global_lock);
-			vm_wait(NULL);
+			vm_wait(NULL, 0);
 			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
Index: sys/kern/kern_sig.c
===================================================================
--- sys/kern/kern_sig.c
+++ sys/kern/kern_sig.c
@@ -3066,6 +3066,22 @@
 	return (1);
 }
 
+void
+proc_wkilled(struct proc *p)
+{
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	if ((p->p_flag & P_WKILLED) == 0) {
+		p->p_flag |= P_WKILLED;
+		/*
+		 * Notify swapper that there is a process to swap in.
+		 * The notification is racy, at worst it would take 10
+		 * seconds for the swapper process to notice.
+		 */
+		wakeup(&proc0);
+	}
+}
+
 /*
  * Kill the current process for stated reason.
  */
@@ -3078,7 +3094,7 @@
 	    p->p_comm);
 	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid,
 	    p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why);
-	p->p_flag |= P_WKILLED;
+	proc_wkilled(p);
 	kern_psignal(p, SIGKILL);
 }
 
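The notification in proc_wkilled() follows the usual set-flag-then-wakeup() pattern, and the race the comment mentions is benign because the sleeper uses a finite timeout. A minimal sketch of the pattern, for illustration only (pending, notify, and service_loop are hypothetical names, not part of this patch):

	static int pending;

	static void
	notify(void)
	{

		pending = 1;
		wakeup(&pending);
	}

	static void
	service_loop(void)
	{

		for (;;) {
			if (pending != 0) {
				pending = 0;
				/* Perform the deferred work. */
			}
			/* A wakeup lost to the race costs one timeout. */
			tsleep(&pending, PVM, "svc", 10 * hz);
		}
	}

This mirrors the swapper sleeping on &proc0: even if wakeup(&proc0) fires between the swapper's flag check and its tsleep(), the next timed wakeup observes P_WKILLED.
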
Index: sys/kern/sys_process.c
===================================================================
--- sys/kern/sys_process.c
+++ sys/kern/sys_process.c
@@ -1170,7 +1170,7 @@
 		 * queue cannot accommodate any new signals.
 		 */
 		if (data == SIGKILL)
-			p->p_flag |= P_WKILLED;
+			proc_wkilled(p);
 
 		/*
 		 * Unsuspend all threads.  To leave a thread
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -1050,6 +1050,7 @@
 void	proc_reap(struct thread *td, struct proc *p, int *status, int options);
 void	proc_reparent(struct proc *child, struct proc *newparent);
 void	proc_set_traced(struct proc *p, bool stop);
+void	proc_wkilled(struct proc *p);
 struct	pstats *pstats_alloc(void);
 void	pstats_fork(struct pstats *src, struct pstats *dst);
 void	pstats_free(struct pstats *ps);
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -134,6 +134,16 @@
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
 	    int backward, int forward, bool obj_locked);
 
+static int vm_pfault_oom_attempts = 3;
+SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
+    &vm_pfault_oom_attempts, 0,
+    "Number of allocation attempts in vm_fault() before starting OOM");
+
+static int vm_pfault_oom_wait = 10;
+SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
+    &vm_pfault_oom_wait, 0,
+    "Seconds to wait for free pages in vm_fault() before retrying");
+
 static inline void
 release_page(struct faultstate *fs)
 {
@@ -552,7 +562,7 @@
 	vm_pindex_t retry_pindex;
 	vm_prot_t prot, retry_prot;
 	int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
-	int locked, nera, result, rv;
+	int locked, nera, oom, result, rv;
 	u_char behavior;
 	boolean_t wired;	/* Passed by reference. */
 	bool dead, hardfault, is_first_object_locked;
@@ -563,7 +573,9 @@
 	nera = -1;
 	hardfault = false;
 
-RetryFault:;
+RetryFault:
+	oom = 0;
+RetryFault_oom:
 
 	/*
 	 * Find the backing store object and offset into it to begin the
@@ -805,7 +817,17 @@
 		}
 		if (fs.m == NULL) {
 			unlock_and_deallocate(&fs);
-			vm_waitpfault();
+			if (vm_pfault_oom_attempts < 0 ||
+			    oom < vm_pfault_oom_attempts) {
+				oom++;
+				vm_waitpfault(vm_pfault_oom_wait * hz);
+				goto RetryFault_oom;
+			}
+			if (bootverbose)
+				printf(
+	"proc %d (%s) failed to alloc page on fault, starting OOM\n",
+				    curproc->p_pid, curproc->p_comm);
+			vm_pageout_oom(VM_OOM_MEM_PF);
 			goto RetryFault;
 		}
 	}
@@ -1719,7 +1741,7 @@
 		if (dst_m == NULL) {
 			VM_OBJECT_WUNLOCK(dst_object);
 			VM_OBJECT_RUNLOCK(object);
-			vm_wait(dst_object);
+			vm_wait(dst_object, 0);
 			VM_OBJECT_WLOCK(dst_object);
 			goto again;
 		}
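The retry policy added to vm_fault() reduces to a single predicate, restated here as a sketch (pfault_should_wait is a hypothetical helper, not part of the patch). A negative vm.pfault_oom_attempts makes the first disjunct always true, so setting it to -1 disables page-fault-initiated OOM and keeps the old unbounded vm_waitpfault() behavior:

	/* Return true to keep waiting for free pages, false to start OOM. */
	static bool
	pfault_should_wait(int oom_retries, int attempts)
	{

		return (attempts < 0 || oom_retries < attempts);
	}

With the defaults, a faulting thread waits vm_pfault_oom_wait (10) seconds per attempt for up to 3 attempts, roughly 30 seconds in total, before vm_pageout_oom(VM_OOM_MEM_PF) runs.
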
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -2899,13 +2899,13 @@
  *	Wait for free pages to exceed the min threshold globally.
  */
 void
-vm_wait_min(void)
+vm_wait_min(int timo)
 {
 
 	mtx_lock(&vm_domainset_lock);
 	while (vm_page_count_min()) {
 		vm_min_waiters++;
-		msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0);
+		msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", timo);
 	}
 	mtx_unlock(&vm_domainset_lock);
 }
@@ -2934,7 +2934,7 @@
 }
 
 static void
-vm_wait_doms(const domainset_t *wdoms)
+vm_wait_doms(const domainset_t *wdoms, int timo)
 {
 
 	/*
@@ -2960,7 +2960,7 @@
 		if (DOMAINSET_SUBSET(&vm_min_domains, wdoms)) {
 			vm_min_waiters++;
 			msleep(&vm_min_domains, &vm_domainset_lock, PVM,
-			    "vmwait", 0);
+			    "vmwait", timo);
 		}
 		mtx_unlock(&vm_domainset_lock);
 	}
@@ -2994,7 +2994,7 @@
 			panic("vm_wait in early boot");
 		DOMAINSET_ZERO(&wdom);
 		DOMAINSET_SET(vmd->vmd_domain, &wdom);
-		vm_wait_doms(&wdom);
+		vm_wait_doms(&wdom, 0);
 	}
 }
 
@@ -3007,7 +3007,7 @@
  *	Called in various places after failed memory allocations.
  */
 void
-vm_wait(vm_object_t obj)
+vm_wait(vm_object_t obj, int timo)
 {
 	struct domainset *d;
 
@@ -3022,7 +3022,7 @@
 	if (d == NULL)
 		d = curthread->td_domain.dr_policy;
 
-	vm_wait_doms(&d->ds_mask);
+	vm_wait_doms(&d->ds_mask, timo);
 }
 
 /*
@@ -3067,13 +3067,14 @@
  *	this balance without careful testing first.
  */
 void
-vm_waitpfault(void)
+vm_waitpfault(int timo)
 {
 
 	mtx_lock(&vm_domainset_lock);
 	if (vm_page_count_min()) {
 		vm_min_waiters++;
-		msleep(&vm_min_domains, &vm_domainset_lock, PUSER, "pfault", 0);
+		msleep(&vm_min_domains, &vm_domainset_lock, PUSER, "pfault",
+		    timo);
 	}
 	mtx_unlock(&vm_domainset_lock);
 }
Index: sys/vm/vm_pageout.h
===================================================================
--- sys/vm/vm_pageout.h
+++ sys/vm/vm_pageout.h
@@ -79,7 +79,8 @@
 extern int vm_pageout_page_count;
 
 #define	VM_OOM_MEM	1
-#define	VM_OOM_SWAPZ	2
+#define	VM_OOM_MEM_PF	2
+#define	VM_OOM_SWAPZ	3
 
 /*
  * vm_lowmem flags.
@@ -95,10 +96,10 @@
 /*
  *	Signal pageout-daemon and wait for it.
  */
 
-void vm_wait(vm_object_t obj);
-void vm_waitpfault(void);
+void vm_wait(vm_object_t obj, int timo);
+void vm_waitpfault(int timo);
 void vm_wait_domain(int domain);
-void vm_wait_min(void);
+void vm_wait_min(int timo);
 void vm_wait_severe(void);
 int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
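The timo argument threaded through vm_wait(), vm_wait_doms(), vm_waitpfault(), and vm_wait_min() is in ticks and is handed straight to msleep(9), so 0 preserves the historical sleep-until-woken behavior. A hypothetical caller bounding each wait to about one second might look like this (sketch only; obj and pindex are assumed to be in scope, and the object lock is dropped around the wait as the drm2 callers above do):

	VM_OBJECT_WLOCK(obj);
	while ((m = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL)) == NULL) {
		VM_OBJECT_WUNLOCK(obj);
		/* Wake up at least once per second instead of blocking. */
		vm_wait(obj, hz);
		VM_OBJECT_WLOCK(obj);
	}
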
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -1752,6 +1752,13 @@
 	return (res);
 }
 
+static int vm_oom_ratelim_count;
+static int vm_oom_ratelim_last;
+static int vm_oom_pf_secs = 10;
+SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
+    "Seconds between OOM kills initiated from page faults");
+static struct mtx vm_oom_ratelim_mtx;
+
 void
 vm_pageout_oom(int shortage)
 {
@@ -1759,8 +1766,34 @@
 	vm_offset_t size, bigsize;
 	struct thread *td;
 	struct vmspace *vm;
+	int now;
 	bool breakout;
 
+	/*
+	 * For OOM requests originating from vm_fault(), there is a high
+	 * chance that a single large process faults simultaneously in
+	 * several threads.  Also, on an active system running many
+	 * processes of middle size, like buildworld, all of them
+	 * could fault almost simultaneously as well.
+	 *
+	 * To avoid killing too many processes, rate-limit OOMs
+	 * initiated by vm_fault() time-outs on the waits for free
+	 * pages.
+	 */
+	mtx_lock(&vm_oom_ratelim_mtx);
+	now = ticks;
+	if (shortage == VM_OOM_MEM_PF &&
+	    (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) {
+		if (vm_oom_ratelim_count++ > 0) {
+			mtx_unlock(&vm_oom_ratelim_mtx);
+			return;
+		}
+	} else {
+		vm_oom_ratelim_last = now;
+		vm_oom_ratelim_count = 0;
+	}
+	mtx_unlock(&vm_oom_ratelim_mtx);
+
 	/*
 	 * We keep the process bigproc locked once we find it to keep anyone
 	 * from messing with it; however, there is a possibility of
@@ -1825,7 +1858,7 @@
 			continue;
 		}
 		size = vmspace_swap_count(vm);
-		if (shortage == VM_OOM_MEM)
+		if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
 			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
 		vmspace_free(vm);
@@ -2064,6 +2097,7 @@
 	int error;
 	int i;
 
+	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
 	swap_pager_swap_init();
 	snprintf(curthread->td_name, sizeof(curthread->td_name), "dom0");
 	error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
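With this change, vm_pageout_oom() allows at most a couple of page-fault-initiated kills per vm.oom_pf_secs window. The window test can be extracted into a self-contained userland demo (hypothetical names; the loop counter stands in for ticks):

	#include <stdbool.h>
	#include <stdio.h>

	static int limit_last, limit_count;

	static bool
	ratelimited(int now, int window)
	{
		if ((unsigned)(now - limit_last) >= (unsigned)window) {
			limit_last = now;	/* New window: allow, reset. */
			limit_count = 0;
			return (false);
		}
		return (limit_count++ > 0);	/* One repeat per window. */
	}

	int
	main(void)
	{
		int t;

		for (t = 0; t < 25; t += 3)
			printf("t=%2d %s\n", t,
			    ratelimited(t, 10) ? "suppressed" : "allowed");
		return (0);
	}

Casting the difference to unsigned makes the window comparison wrap around together with the tick counter, matching the (u_int) idiom used in the kernel code above.
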
Index: sys/vm/vm_swapout.c
===================================================================
--- sys/vm/vm_swapout.c
+++ sys/vm/vm_swapout.c
@@ -152,6 +152,11 @@
     &swap_idle_threshold2, 0,
     "Time before a process will be swapped out");
 
+static int swapper_swapin_oom_timeout = 1;
+SYSCTL_INT(_vm, OID_AUTO, swapper_swapin_oom_timeout, CTLFLAG_RW,
+    &swapper_swapin_oom_timeout, 0,
+    "Interval at which the swapper retries faulting in OOM-killed processes");
+
 static int vm_pageout_req_swapout;	/* XXX */
 static int vm_daemon_needed;
 static struct mtx vm_daemon_mtx;
@@ -164,7 +169,7 @@
 static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void swapout_procs(int action);
 static void vm_req_vmdaemon(int req);
-static void vm_thread_swapin(struct thread *td);
+static void vm_thread_swapin(struct thread *td, bool oom_swapin);
 static void vm_thread_swapout(struct thread *td);
 
 /*
@@ -563,7 +568,7 @@
  * Bring the kernel stack for a specified thread back in.
  */
 static void
-vm_thread_swapin(struct thread *td)
+vm_thread_swapin(struct thread *td, bool oom_swapin)
 {
 	vm_object_t ksobj;
 	vm_page_t ma[KSTACK_MAX_PAGES];
@@ -572,8 +577,8 @@
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	VM_OBJECT_WLOCK(ksobj);
-	(void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED, ma,
-	    pages);
+	(void)vm_page_grab_pages(ksobj, 0, (oom_swapin ? VM_ALLOC_SYSTEM :
+	    VM_ALLOC_NORMAL) | VM_ALLOC_WIRED, ma, pages);
 	for (i = 0; i < pages;) {
 		vm_page_assert_xbusied(ma[i]);
 		if (ma[i]->valid == VM_PAGE_BITS_ALL) {
@@ -601,8 +606,8 @@
 	cpu_thread_swapin(td);
 }
 
-void
-faultin(struct proc *p)
+static void
+faultin1(struct proc *p, bool oom_swapin)
 {
 	struct thread *td;
 
@@ -631,7 +636,7 @@
 		 * swapped out.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td)
-			vm_thread_swapin(td);
+			vm_thread_swapin(td, oom_swapin);
 		PROC_LOCK(p);
 		swapclear(p);
 		p->p_swtick = ticks;
@@ -643,6 +648,13 @@
 	}
 }
 
+void
+faultin(struct proc *p)
+{
+
+	faultin1(p, false);
+}
+
 /*
  * This swapin algorithm attempts to swap-in processes only if there
  * is enough space for them.  Of course, if a process waits for a long
@@ -653,21 +665,34 @@
 {
 	struct proc *p, *pp;
 	struct thread *td;
-	int ppri, pri, slptime, swtime;
+	int min_flag, ppri, pri, slptime, swtime;
 
 loop:
-	if (vm_page_count_min()) {
-		vm_wait_min();
-		goto loop;
-	}
-
 	pp = NULL;
 	ppri = INT_MIN;
+	min_flag = vm_page_count_min();
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
-		if (p->p_state == PRS_NEW ||
-		    p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
+		if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
+		    P_SWAPPINGIN | P_INMEM)) != 0) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
+			/*
+			 * A swapped-out process might have mapped a
+			 * large portion of the system's pages as
+			 * anonymous memory.  There is no other way to
+			 * release the memory other than to kill the
+			 * process, for which we need to swap it in.
+			 */
+			sx_sunlock(&allproc_lock);
+			faultin1(p, true);
+			PROC_UNLOCK(p);
+			goto loop;
+		}
+		if (min_flag) {
 			PROC_UNLOCK(p);
 			continue;
 		}
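Once the patch is in place, the new knobs can be inspected (or, via the newp arguments, set) from userland with sysctlbyname(3). A small runnable example, assuming the sysctl names added above:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int attempts, wait;
		size_t len;

		len = sizeof(attempts);
		if (sysctlbyname("vm.pfault_oom_attempts", &attempts, &len,
		    NULL, 0) != 0) {
			perror("vm.pfault_oom_attempts");
			return (1);
		}
		len = sizeof(wait);
		if (sysctlbyname("vm.pfault_oom_wait", &wait, &len,
		    NULL, 0) != 0) {
			perror("vm.pfault_oom_wait");
			return (1);
		}
		printf("OOM after %d attempts, %d s wait per attempt\n",
		    attempts, wait);
		return (0);
	}

Since both OIDs are CTLFLAG_RWTUN, vm.pfault_oom_attempts can also be set to -1 as a loader tunable to opt out of page-fault-initiated OOM kills entirely.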