Index: head/sys/sys/vmmeter.h
===================================================================
--- head/sys/sys/vmmeter.h
+++ head/sys/sys/vmmeter.h
@@ -187,6 +187,13 @@
 	return (!DOMAINSET_EMPTY(&vm_severe_domains));
 }
 
+static inline int
+vm_page_count_severe_set(domainset_t *mask)
+{
+
+	return (DOMAINSET_SUBSET(&vm_severe_domains, mask));
+}
+
 /*
  * Return TRUE if we are under our minimum low-free-pages threshold.
  *
Index: head/sys/vm/vm_domainset.h
===================================================================
--- head/sys/vm/vm_domainset.h
+++ head/sys/vm/vm_domainset.h
@@ -34,9 +34,10 @@
 	struct domainset	*di_domain;
 	int			*di_iter;
 	vm_pindex_t		di_offset;
-	int			di_policy;
 	int			di_flags;
-	int			di_n;
+	uint16_t		di_policy;
+	domainid_t		di_n;
+	bool			di_minskip;
 };
 
 int	vm_domainset_iter_page(struct vm_domainset_iter *, int *, int *);
@@ -45,5 +46,7 @@
 int	vm_domainset_iter_malloc(struct vm_domainset_iter *, int *, int *);
 void	vm_domainset_iter_malloc_init(struct vm_domainset_iter *,
 	    struct vm_object *, int *, int *);
+
+void	vm_wait_doms(const domainset_t *);
 
 #endif /* __VM_DOMAINSET_H__ */
Index: head/sys/vm/vm_domainset.c
===================================================================
--- head/sys/vm/vm_domainset.c
+++ head/sys/vm/vm_domainset.c
@@ -100,6 +100,8 @@
 		pindex += (((uintptr_t)obj) / sizeof(*obj));
 		di->di_offset = pindex;
 	}
+	/* Skip zones below min on the first pass. */
+	di->di_minskip = true;
 }
 
 static void
@@ -213,6 +215,8 @@
 	*req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
 	    VM_ALLOC_NOWAIT;
 	vm_domainset_iter_first(di, domain);
+	if (DOMAINSET_ISSET(*domain, &vm_min_domains))
+		vm_domainset_iter_page(di, domain, req);
 }
 
 int
@@ -227,8 +231,15 @@
 		return (ENOMEM);
 
 	/* If there are more domains to visit we run the iterator. */
-	if (--di->di_n != 0) {
+	while (--di->di_n != 0) {
 		vm_domainset_iter_next(di, domain);
+		if (!di->di_minskip ||
+		    !DOMAINSET_ISSET(*domain, &vm_min_domains))
+			return (0);
+	}
+	if (di->di_minskip) {
+		di->di_minskip = false;
+		vm_domainset_iter_first(di, domain);
 		return (0);
 	}
 
@@ -258,6 +269,8 @@
 	di->di_flags = *flags;
 	*flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
 	vm_domainset_iter_first(di, domain);
+	if (DOMAINSET_ISSET(*domain, &vm_min_domains))
+		vm_domainset_iter_malloc(di, domain, flags);
 }
 
 int
@@ -265,8 +278,17 @@
 {
 
 	/* If there are more domains to visit we run the iterator. */
-	if (--di->di_n != 0) {
+	while (--di->di_n != 0) {
 		vm_domainset_iter_next(di, domain);
+		if (!di->di_minskip ||
+		    !DOMAINSET_ISSET(*domain, &vm_min_domains))
+			return (0);
+	}
+
+	/* If we skipped zones below min start the search from the beginning. */
+	if (di->di_minskip) {
+		di->di_minskip = false;
+		vm_domainset_iter_first(di, domain);
 		return (0);
 	}
 
Index: head/sys/vm/vm_fault.c
===================================================================
--- head/sys/vm/vm_fault.c
+++ head/sys/vm/vm_fault.c
@@ -548,6 +548,7 @@
 {
 	struct faultstate fs;
 	struct vnode *vp;
+	struct domainset *dset;
 	vm_object_t next_object, retry_object;
 	vm_offset_t e_end, e_start;
 	vm_pindex_t retry_pindex;
@@ -791,7 +792,11 @@
 			 * there, and allocation can fail, causing
 			 * restart and new reading of the p_flag.
 			 */
-			if (!vm_page_count_severe() || P_KILLED(curproc)) {
+			dset = fs.object->domain.dr_policy;
+			if (dset == NULL)
+				dset = curthread->td_domain.dr_policy;
+			if (!vm_page_count_severe_set(&dset->ds_mask) ||
+			    P_KILLED(curproc)) {
 #if VM_NRESERVLEVEL > 0
 				vm_object_color(fs.object, atop(vaddr) -
 				    fs.pindex);
@@ -806,7 +811,7 @@
 			}
 			if (fs.m == NULL) {
 				unlock_and_deallocate(&fs);
-				vm_waitpfault();
+				vm_waitpfault(dset);
 				goto RetryFault;
 			}
 		}
Index: head/sys/vm/vm_glue.c
===================================================================
--- head/sys/vm/vm_glue.c
+++ head/sys/vm/vm_glue.c
@@ -92,6 +92,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -534,6 +535,7 @@
     struct vmspace *vm2, int flags)
 {
 	struct proc *p1 = td->td_proc;
+	struct domainset *dset;
 	int error;
 
 	if ((flags & RFPROC) == 0) {
@@ -557,9 +559,9 @@
 		p2->p_vmspace = p1->p_vmspace;
 		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
 	}
-
-	while (vm_page_count_severe()) {
-		vm_wait_severe();
+	dset = td2->td_domain.dr_policy;
+	while (vm_page_count_severe_set(&dset->ds_mask)) {
+		vm_wait_doms(&dset->ds_mask);
 	}
 
 	if ((flags & RFMEM) == 0) {
Index: head/sys/vm/vm_page.c
===================================================================
--- head/sys/vm/vm_page.c
+++ head/sys/vm/vm_page.c
@@ -2935,7 +2935,7 @@
 	return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
 }
 
-static void
+void
 vm_wait_doms(const domainset_t *wdoms)
 {
 
@@ -2961,10 +2961,10 @@
 		mtx_lock(&vm_domainset_lock);
 		if (DOMAINSET_SUBSET(&vm_min_domains, wdoms)) {
 			vm_min_waiters++;
-			msleep(&vm_min_domains, &vm_domainset_lock, PVM,
-			    "vmwait", 0);
-		}
-		mtx_unlock(&vm_domainset_lock);
+			msleep(&vm_min_domains, &vm_domainset_lock,
+			    PVM | PDROP, "vmwait", 0);
+		} else
+			mtx_unlock(&vm_domainset_lock);
 	}
 }
 
@@ -3069,15 +3069,21 @@
  * this balance without careful testing first.
  */
 void
-vm_waitpfault(void)
+vm_waitpfault(struct domainset *dset)
 {
 
+	/*
+	 * XXX Ideally we would wait only until the allocation could
+	 * be satisfied.  This condition can cause new allocators to
+	 * consume all freed pages while old allocators wait.
+	 */
 	mtx_lock(&vm_domainset_lock);
-	if (vm_page_count_min()) {
+	if (DOMAINSET_SUBSET(&vm_min_domains, &dset->ds_mask)) {
 		vm_min_waiters++;
-		msleep(&vm_min_domains, &vm_domainset_lock, PUSER, "pfault", 0);
-	}
-	mtx_unlock(&vm_domainset_lock);
+		msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP,
+		    "pfault", 0);
+	} else
+		mtx_unlock(&vm_domainset_lock);
 }
 
 struct vm_pagequeue *
Index: head/sys/vm/vm_pageout.h
===================================================================
--- head/sys/vm/vm_pageout.h
+++ head/sys/vm/vm_pageout.h
@@ -96,7 +96,7 @@
  */
 
 void vm_wait(vm_object_t obj);
-void vm_waitpfault(void);
+void vm_waitpfault(struct domainset *);
 void vm_wait_domain(int domain);
 void vm_wait_min(void);
 void vm_wait_severe(void);
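Usage note (not part of the patch): the sketch below shows how a kernel caller might combine the two interfaces introduced above, the vm_page_count_severe_set() inline added to sys/vmmeter.h and the newly exported vm_wait_doms() from vm/vm_domainset.h, following the same pattern as the vm_forkproc() hunk in vm_glue.c. The helper function, its name, and the fallback to the current thread's domainset policy are illustrative assumptions only.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/domainset.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_domainset.h>

/*
 * Hypothetical throttle helper: sleep while every domain allowed by the
 * caller's policy is severely short of free pages, re-checking the set
 * after each wakeup, instead of blocking on the global severe state.
 */
static void
example_wait_for_policy_domains(void)
{
	struct domainset *dset;

	/* Fall back to the current thread's policy, as the vm_fault() hunk does. */
	dset = curthread->td_domain.dr_policy;

	while (vm_page_count_severe_set(&dset->ds_mask))
		vm_wait_doms(&dset->ds_mask);
}

The point of the pattern is that a NUMA-restricted consumer no longer sleeps on, or is woken for, domains its policy can never allocate from.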