Index: head/sys/vm/vm_page.c
===================================================================
--- head/sys/vm/vm_page.c
+++ head/sys/vm/vm_page.c
@@ -139,14 +139,15 @@

 struct vm_domain vm_dom[MAXMEMDOM];

 struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
+
 struct mtx_padalign __exclusive_cache_line vm_domainset_lock;
+/* The following fields are protected by the domainset lock. */
 domainset_t __exclusive_cache_line vm_min_domains;
 domainset_t __exclusive_cache_line vm_severe_domains;
 static int vm_min_waiters;
 static int vm_severe_waiters;
 static int vm_pageproc_waiters;
-
 /*
  * bogus page -- for I/O to/from partially complete buffers,
  * or for paging into sparsely invalid regions.
@@ -184,7 +185,6 @@
     vm_page_t mpred);
 static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
     vm_page_t m_run, vm_paddr_t high);
-static void vm_domain_free_wakeup(struct vm_domain *);
 static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
     int req);

@@ -430,6 +430,7 @@
	    MTX_DEF | MTX_DUPOK);
	}
	mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
+	mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
	snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
 }

@@ -731,8 +732,8 @@
	vmd = VM_DOMAIN(seg->domain);
	vm_domain_free_lock(vmd);
	vm_phys_free_contig(m, pagecount);
-	vm_domain_freecnt_adj(vmd, (int)pagecount);
	vm_domain_free_unlock(vmd);
+	vm_domain_freecnt_inc(vmd, pagecount);
	vm_cnt.v_page_count += (u_int)pagecount;

	vmd = VM_DOMAIN(seg->domain);
@@ -1694,7 +1695,6 @@
	struct vm_domain *vmd;
	vm_page_t m;
	int flags;
-	u_int free_count;

	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
@@ -1747,6 +1747,9 @@
 #endif
		}
	}
+	if (m != NULL)
+		vm_domain_freecnt_dec(vmd, 1);
+	vm_domain_free_unlock(vmd);
	if (m == NULL) {
		/*
		 * Not allocatable, give up.
@@ -1760,15 +1763,7 @@
	 * At this point we had better have found a good page.
	 */
	KASSERT(m != NULL, ("missing page"));
-	free_count = vm_domain_freecnt_adj(vmd, -1);
-	vm_domain_free_unlock(vmd);
-	/*
-	 * Don't wakeup too often - wakeup the pageout daemon when
-	 * we would be nearly out of memory.
-	 */
-	if (vm_paging_needed(vmd, free_count))
-		pagedaemon_wakeup(vmd->vmd_domain);

 #if VM_NRESERVLEVEL > 0
 found:
 #endif
@@ -1804,7 +1799,6 @@

	if (object != NULL) {
		if (vm_page_insert_after(m, object, pindex, mpred)) {
-			pagedaemon_wakeup(domain);
			if (req & VM_ALLOC_WIRED) {
				vm_wire_sub(1);
				m->wire_count = 0;
@@ -1961,13 +1955,14 @@
			goto retry;
 #endif
	}
+	if (m_ret != NULL)
+		vm_domain_freecnt_dec(vmd, npages);
+	vm_domain_free_unlock(vmd);
	if (m_ret == NULL) {
		if (vm_domain_alloc_fail(vmd, object, req))
			goto again;
		return (NULL);
	}
-	vm_domain_freecnt_adj(vmd, -npages);
-	vm_domain_free_unlock(vmd);
 #if VM_NRESERVLEVEL > 0
 found:
 #endif
@@ -2006,7 +2001,6 @@
	m->oflags = oflags;
	if (object != NULL) {
		if (vm_page_insert_after(m, object, pindex, mpred)) {
-			pagedaemon_wakeup(domain);
			if ((req & VM_ALLOC_WIRED) != 0)
				vm_wire_sub(npages);
			KASSERT(m->object == NULL,
@@ -2035,9 +2029,6 @@
			pmap_page_set_memattr(m, memattr);
		pindex++;
	}
-	vmd = VM_DOMAIN(domain);
-	if (vm_paging_needed(vmd, vmd->vmd_free_count))
-		pagedaemon_wakeup(domain);
	return (m_ret);
 }

@@ -2100,7 +2091,7 @@
 {
	struct vm_domain *vmd;
	vm_page_t m;
-	u_int flags, free_count;
+	u_int flags;

	/*
	 * Do not allocate reserved pages unless the req has asked for it.
@@ -2111,13 +2102,14 @@
	if (vm_domain_available(vmd, req, 1))
		m = vm_phys_alloc_freelist_pages(domain, freelist,
		    VM_FREEPOOL_DIRECT, 0);
+	if (m != NULL)
+		vm_domain_freecnt_dec(vmd, 1);
+	vm_domain_free_unlock(vmd);
	if (m == NULL) {
		if (vm_domain_alloc_fail(vmd, NULL, req))
			goto again;
		return (NULL);
	}
-	free_count = vm_domain_freecnt_adj(vmd, -1);
-	vm_domain_free_unlock(vmd);
	vm_page_alloc_check(m);

	/*
@@ -2138,8 +2130,6 @@
	}
	/* Unmanaged pages don't use "act_count". */
	m->oflags = VPO_UNMANAGED;
-	if (vm_paging_needed(vmd, free_count))
-		pagedaemon_wakeup(domain);
	return (m);
 }

@@ -2539,15 +2529,19 @@
	if (m_mtx != NULL)
		mtx_unlock(m_mtx);
	if ((m = SLIST_FIRST(&free)) != NULL) {
+		int cnt;
+
		vmd = VM_DOMAIN(domain);
+		cnt = 0;
		vm_domain_free_lock(vmd);
		do {
			MPASS(vm_phys_domain(m) == domain);
			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
			vm_page_free_phys(vmd, m);
+			cnt++;
		} while ((m = SLIST_FIRST(&free)) != NULL);
-		vm_domain_free_wakeup(vmd);
		vm_domain_free_unlock(vmd);
+		vm_domain_freecnt_inc(vmd, cnt);
	}
	return (error);
 }

@@ -2710,7 +2704,7 @@
 /*
  * Clear the domain from the appropriate page level domainset.
  */
-static void
+void
 vm_domain_clear(struct vm_domain *vmd)
 {

@@ -2731,6 +2725,22 @@
			wakeup(&vm_severe_domains);
		}
	}
+
+	/*
+	 * If pageout daemon needs pages, then tell it that there are
+	 * some free.
+	 */
+	if (vmd->vmd_pageout_pages_needed &&
+	    vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
+		wakeup(&vmd->vmd_pageout_pages_needed);
+		vmd->vmd_pageout_pages_needed = 0;
+	}
+
+	/* See comments in vm_wait_doms(). */
+	if (vm_pageproc_waiters) {
+		vm_pageproc_waiters = 0;
+		wakeup(&vm_pageproc_waiters);
+	}
	mtx_unlock(&vm_domainset_lock);
 }

@@ -2769,7 +2779,7 @@
 vm_wait_count(void)
 {

-	return (vm_severe_waiters + vm_min_waiters);
+	return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters);
 }

 static void
@@ -2787,9 +2797,8 @@
	if (curproc == pageproc) {
		mtx_lock(&vm_domainset_lock);
		vm_pageproc_waiters++;
-		msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM,
+		msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP,
		    "pageprocwait", 1);
-		mtx_unlock(&vm_domainset_lock);
	} else {
		/*
		 * XXX Ideally we would wait only until the allocation could
@@ -2819,14 +2828,17 @@
	domainset_t wdom;

	vmd = VM_DOMAIN(domain);
-	vm_domain_free_assert_locked(vmd);
+	vm_domain_free_assert_unlocked(vmd);
	if (curproc == pageproc) {
-		vmd->vmd_pageout_pages_needed = 1;
-		msleep(&vmd->vmd_pageout_pages_needed,
-		    vm_domain_free_lockptr(vmd), PDROP | PSWP, "VMWait", 0);
+		mtx_lock(&vm_domainset_lock);
+		if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) {
+			vmd->vmd_pageout_pages_needed = 1;
+			msleep(&vmd->vmd_pageout_pages_needed,
+			    &vm_domainset_lock, PDROP | PSWP, "VMWait", 0);
+		} else
+			mtx_unlock(&vm_domainset_lock);
	} else {
-		vm_domain_free_unlock(vmd);
		if (pageproc == NULL)
			panic("vm_wait in early boot");
		DOMAINSET_ZERO(&wdom);
@@ -2876,7 +2888,7 @@
 vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
 {

-	vm_domain_free_assert_locked(vmd);
+	vm_domain_free_assert_unlocked(vmd);

	atomic_add_int(&vmd->vmd_pageout_deficit,
	    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
@@ -2888,10 +2900,8 @@
		VM_OBJECT_WLOCK(object);
		if (req & VM_ALLOC_WAITOK)
			return (EAGAIN);
-	} else {
-		vm_domain_free_unlock(vmd);
-		pagedaemon_wakeup(vmd->vmd_domain);
	}
+
	return (0);
 }

@@ -3063,46 +3073,6 @@
 }

 /*
- * vm_domain_free_wakeup:
- *
- *	Helper routine for vm_page_free_toq().  This routine is called
- *	when a page is added to the free queues.
- *
- *	The page queues must be locked.
- */
-static void
-vm_domain_free_wakeup(struct vm_domain *vmd)
-{
-
-	vm_domain_free_assert_locked(vmd);
-
-	/*
-	 * if pageout daemon needs pages, then tell it that there are
-	 * some free.
-	 */
-	if (vmd->vmd_pageout_pages_needed &&
-	    vmd->vmd_free_count >= vmd->vmd_pageout_free_min) {
-		wakeup(&vmd->vmd_pageout_pages_needed);
-		vmd->vmd_pageout_pages_needed = 0;
-	}
-	/*
-	 * wakeup processes that are waiting on memory if we hit a
-	 * high water mark. And wakeup scheduler process if we have
-	 * lots of memory. this process will swapin processes.
-	 */
-	if ((vmd->vmd_minset && !vm_paging_min(vmd)) ||
-	    (vmd->vmd_severeset && !vm_paging_severe(vmd)))
-		vm_domain_clear(vmd);
-
-	/* See comments in vm_wait(); */
-	if (vm_pageproc_waiters) {
-		vm_pageproc_waiters = 0;
-		wakeup(&vm_pageproc_waiters);
-	}
-
-}
-
-/*
  * vm_page_free_prep:
  *
  *	Prepares the given page to be put on the free list,
@@ -3183,7 +3153,8 @@

 /*
  * Insert the page into the physical memory allocator's free page
- * queues.  This is the last step to free a page.
+ * queues.  This is the last step to free a page.  The caller is
+ * responsible for adjusting the free page count.
  */
 static void
 vm_page_free_phys(struct vm_domain *vmd, vm_page_t m)
@@ -3191,7 +3162,6 @@
 {

	vm_domain_free_assert_locked(vmd);
-	vm_domain_freecnt_adj(vmd, 1);
 #if VM_NRESERVLEVEL > 0
	if (!vm_reserv_free_page(m))
 #endif
@@ -3203,24 +3173,28 @@
 {
	struct vm_domain *vmd;
	vm_page_t m;
+	int cnt;

	if (TAILQ_EMPTY(tq))
		return;

	vmd = NULL;
+	cnt = 0;
	TAILQ_FOREACH(m, tq, listq) {
		if (vmd != vm_pagequeue_domain(m)) {
			if (vmd != NULL) {
-				vm_domain_free_wakeup(vmd);
				vm_domain_free_unlock(vmd);
+				vm_domain_freecnt_inc(vmd, cnt);
+				cnt = 0;
			}
			vmd = vm_pagequeue_domain(m);
			vm_domain_free_lock(vmd);
		}
		vm_page_free_phys(vmd, m);
+		cnt++;
	}
	if (vmd != NULL) {
-		vm_domain_free_wakeup(vmd);
		vm_domain_free_unlock(vmd);
+		vm_domain_freecnt_inc(vmd, cnt);
	}
 }

@@ -3243,8 +3217,8 @@
	vmd = vm_pagequeue_domain(m);
	vm_domain_free_lock(vmd);
	vm_page_free_phys(vmd, m);
-	vm_domain_free_wakeup(vmd);
	vm_domain_free_unlock(vmd);
+	vm_domain_freecnt_inc(vmd, 1);
 }

 /*
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c
+++ head/sys/vm/vm_pageout.c
@@ -1790,7 +1790,13 @@
	 * The pageout daemon worker is never done, so loop forever.
	 */
	while (TRUE) {
-		vm_domain_free_lock(vmd);
+		vm_domain_pageout_lock(vmd);
+		/*
+		 * We need to clear wanted before we check the limits.  This
+		 * prevents races with wakers who will check wanted after they
+		 * reach the limit.
+		 */
+		atomic_store_int(&vmd->vmd_pageout_wanted, 0);

		/*
		 * Might the page daemon need to run again?
@@ -1801,7 +1807,7 @@
			 * we have performed a level >= 1 (page reclamation)
			 * scan, then sleep a bit and try again.
			 */
-			vm_domain_free_unlock(vmd);
+			vm_domain_pageout_unlock(vmd);
			if (pass > 1)
				pause("pwait", hz / VM_INACT_SCAN_RATE);
		} else {
@@ -1809,12 +1815,18 @@
			 * No, sleep until the next wakeup or until pages
			 * need to have their reference stats updated.
			 */
-			vmd->vmd_pageout_wanted = false;
			if (mtx_sleep(&vmd->vmd_pageout_wanted,
-			    vm_domain_free_lockptr(vmd), PDROP | PVM,
+			    vm_domain_pageout_lockptr(vmd), PDROP | PVM,
			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
				VM_CNT_INC(v_pdwakeups);
		}
+		/* Prevent spurious wakeups by ensuring that wanted is set. */
+		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
+
+		/*
+		 * Use the controller to calculate how many pages to free in
+		 * this interval.
+		 */
		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
		if (shortage && pass == 0)
			pass = 1;
@@ -1970,10 +1982,14 @@
	struct vm_domain *vmd;

	vmd = VM_DOMAIN(domain);
-	vm_domain_free_assert_unlocked(vmd);
+	vm_domain_pageout_assert_unlocked(vmd);
+	if (curproc == pageproc)
+		return;

-	if (!vmd->vmd_pageout_wanted && curthread->td_proc != pageproc) {
-		vmd->vmd_pageout_wanted = true;
+	if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
+		vm_domain_pageout_lock(vmd);
+		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
		wakeup(&vmd->vmd_pageout_wanted);
+		vm_domain_pageout_unlock(vmd);
	}
 }
Index: head/sys/vm/vm_pagequeue.h
===================================================================
--- head/sys/vm/vm_pagequeue.h
+++ head/sys/vm/vm_pagequeue.h
@@ -76,17 +76,31 @@
 #include <sys/pidctrl.h>
 struct sysctl_oid;

+/*
+ * One vm_domain per-numa domain.  Contains pagequeues, free page structures,
+ * and accounting.
+ *
+ * Lock Key:
+ * f	vmd_free_mtx
+ * p	vmd_pageout_mtx
+ * d	vm_domainset_lock
+ * a	atomic
+ * c	const after boot
+*/
 struct vm_domain {
	struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
	struct mtx_padalign vmd_free_mtx;
-	struct vmem *vmd_kernel_arena;
-	u_int vmd_domain;		/* Domain number. */
-	u_int vmd_page_count;
-	long vmd_segs;			/* bitmask of the segments */
+	struct mtx_padalign vmd_pageout_mtx;
+	struct vmem *vmd_kernel_arena;	/* (c) per-domain kva arena. */
+	u_int vmd_domain;		/* (c) Domain number. */
+	u_int vmd_page_count;		/* (c) Total page count. */
+	long vmd_segs;			/* (c) bitmask of the segments */
+	u_int __aligned(CACHE_LINE_SIZE) vmd_free_count; /* (a,f) free page count */
+	u_int vmd_pageout_deficit;	/* (a) Estimated number of pages deficit */
+	uint8_t vmd_pad[CACHE_LINE_SIZE - (sizeof(u_int) * 2)];

-	/* Paging control variables, locked by domain_free_mtx. */
+	/* Paging control variables, used within single threaded page daemon. */
	struct pidctrl vmd_pid;		/* Pageout controller. */
-	u_int vmd_free_count;
	boolean_t vmd_oom;
	int vmd_oom_seq;
	int vmd_last_active_scan;
@@ -94,11 +108,10 @@
	struct vm_page vmd_marker; /* marker for pagedaemon private use */
	struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */

-	int vmd_pageout_pages_needed;	/* page daemon waiting for pages? */
-	int vmd_pageout_deficit;	/* Estimated number of pages deficit */
-	bool vmd_pageout_wanted;	/* pageout daemon wait channel */
-	bool vmd_minset;		/* Are we in vm_min_domains? */
-	bool vmd_severeset;		/* Are we in vm_severe_domains? */
+	int vmd_pageout_wanted;		/* (a, p) pageout daemon wait channel */
+	int vmd_pageout_pages_needed;	/* (d) page daemon waiting for pages? */
+	bool vmd_minset;		/* (d) Are we in vm_min_domains? */
+	bool vmd_severeset;		/* (d) Are we in vm_severe_domains? */
	int vmd_inactq_scans;
	enum {
		VM_LAUNDRY_IDLE = 0,
@@ -142,6 +155,17 @@
 #define	vm_domain_free_unlock(d)					\
	    mtx_unlock(vm_domain_free_lockptr((d)))

+#define	vm_domain_pageout_lockptr(d)					\
+	    (&(d)->vmd_pageout_mtx)
+#define	vm_domain_pageout_assert_locked(n)				\
+	    mtx_assert(vm_domain_pageout_lockptr((n)), MA_OWNED)
+#define	vm_domain_pageout_assert_unlocked(n)				\
+	    mtx_assert(vm_domain_pageout_lockptr((n)), MA_NOTOWNED)
+#define	vm_domain_pageout_lock(d)					\
+	    mtx_lock(vm_domain_pageout_lockptr((d)))
+#define	vm_domain_pageout_unlock(d)					\
+	    mtx_unlock(vm_domain_pageout_lockptr((d)))
+
 static __inline void
 vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
 {
@@ -155,6 +179,7 @@
 #define	vm_pagequeue_cnt_dec(pq)	vm_pagequeue_cnt_add((pq), -1)

 void vm_domain_set(struct vm_domain *vmd);
+void vm_domain_clear(struct vm_domain *vmd);
 int vm_domain_available(struct vm_domain *vmd, int req, int npages);

 /*
@@ -221,18 +246,40 @@
	return (vm_paging_target(vmd));
 }

-static inline u_int
-vm_domain_freecnt_adj(struct vm_domain *vmd, int adj)
+void pagedaemon_wakeup(int domain);
+
+static inline void
+vm_domain_freecnt_inc(struct vm_domain *vmd, int adj)
 {
-	u_int ret;
+	u_int old, new;

-	vm_domain_free_assert_locked(vmd);
-	ret = vmd->vmd_free_count += adj;
-	if ((!vmd->vmd_minset && vm_paging_min(vmd)) ||
-	    (!vmd->vmd_severeset && vm_paging_severe(vmd)))
-		vm_domain_set(vmd);
+	old = atomic_fetchadd_int(&vmd->vmd_free_count, adj);
+	new = old + adj;
+	/*
+	 * Only update bitsets on transitions.  Notice we short-circuit the
+	 * rest of the checks if we're above min already.
+	 */
+	if (old < vmd->vmd_free_min && (new >= vmd->vmd_free_min ||
+	    (old < vmd->vmd_free_severe && new >= vmd->vmd_free_severe) ||
+	    (old < vmd->vmd_pageout_free_min &&
+	    new >= vmd->vmd_pageout_free_min)))
+		vm_domain_clear(vmd);
+}

-	return (ret);
+static inline void
+vm_domain_freecnt_dec(struct vm_domain *vmd, int adj)
+{
+	u_int old, new;
+
+	old = atomic_fetchadd_int(&vmd->vmd_free_count, -adj);
+	new = old - adj;
+	KASSERT(new >= 0, ("vm_domain_freecnt_dec: free count underflow"));
+	if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
+		pagedaemon_wakeup(vmd->vmd_domain);
+	/* Only update bitsets on transitions. */
+	if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
+	    (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
+		vm_domain_set(vmd);
 }
Index: head/sys/vm/vm_reserv.c
===================================================================
--- head/sys/vm/vm_reserv.c
+++ head/sys/vm/vm_reserv.c
@@ -593,7 +593,7 @@
	}
	for (i = 0; i < npages; i++)
		vm_reserv_populate(rv, index + i);
-	vm_domain_freecnt_adj(vmd, -npages);
+	vm_domain_freecnt_dec(vmd, npages);
 out:
	vm_domain_free_unlock(vmd);
	return (m);
@@ -789,7 +789,7 @@
	struct vm_domain *vmd;
	vm_page_t m, msucc;
	vm_reserv_t rv;
-	int index, free_count;
+	int index;

	VM_OBJECT_ASSERT_WLOCKED(object);

@@ -822,13 +822,9 @@
		m = NULL;
	if (m != NULL) {
		vm_reserv_populate(rv, index);
-		free_count = vm_domain_freecnt_adj(vmd, -1);
-	} else
-		free_count = vmd->vmd_free_count;
+		vm_domain_freecnt_dec(vmd, 1);
+	}
	vm_domain_free_unlock(vmd);
-
-	if (vm_paging_needed(vmd, free_count))
-		pagedaemon_wakeup(domain);
	return (m);
 }
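
The free-count handling in this change has two parts: vmd_free_count becomes an atomic that is updated outside vmd_free_mtx, and the expensive work (waking the page daemon, taking vm_domainset_lock to clear or set the min/severe bits) happens only when the old and new values straddle a threshold. What follows is a minimal userland sketch of that transition pattern, not kernel code: free_count, free_min, the stub wakeup, and the harness in main() are all stand-ins invented for illustration; only the old/new crossing test mirrors vm_domain_freecnt_dec() above.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint free_count = 1000;
static const unsigned free_min = 128;	/* stand-in for vmd_free_min */

static void
pagedaemon_wakeup_stub(void)
{

	printf("wakeup: free count crossed below free_min\n");
}

static void
freecnt_dec(unsigned adj)
{
	unsigned old, new;

	old = atomic_fetch_sub(&free_count, adj);
	new = old - adj;
	/*
	 * An underflow check must be phrased against the old value;
	 * "new >= 0" is vacuously true for unsigned arithmetic.
	 */
	assert(adj <= old);
	/* Wake the daemon on the transition only, not on every call. */
	if (old >= free_min && new < free_min)
		pagedaemon_wakeup_stub();
}

int
main(void)
{

	for (int i = 0; i < 10; i++)
		freecnt_dec(100);	/* the wakeup fires exactly once */
	return (0);
}

Exactly one caller can observe a given downward crossing of the threshold, so the daemon is woken once per shortfall rather than once per allocation, which is what lets the diff delete the vm_paging_needed()/pagedaemon_wakeup() pairs scattered through vm_page_alloc() and vm_reserv_alloc_page().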
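
The rewritten daemon loop in vm_pageout.c pairs the vmd_pageout_wanted atomic with the new vmd_pageout_mtx so that wakeups stay cheap for wakers but cannot be lost: the daemon zeroes the flag while holding the lock, before it decides to sleep, and a waker that raises the flag from zero must take the same lock before calling wakeup(). Here is a pthread model of that handshake, under the assumption that a condition variable is a fair stand-in for the kernel sleep channel; pageout_mtx, pageout_cv, nothing_to_do, and both function names are invented for the sketch.

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t pageout_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pageout_cv = PTHREAD_COND_INITIALIZER;
static atomic_int pageout_wanted;

/* Allocator side: only the 0 -> 1 transition pays for the lock. */
static void
pagedaemon_wakeup_model(void)
{

	if (atomic_fetch_add(&pageout_wanted, 1) == 0) {
		pthread_mutex_lock(&pageout_mtx);
		atomic_store(&pageout_wanted, 1);
		pthread_cond_signal(&pageout_cv);
		pthread_mutex_unlock(&pageout_mtx);
	}
}

/* Daemon side: one pass of the clear/check/sleep/scan loop. */
static void
pageout_daemon_iteration(int nothing_to_do)
{

	pthread_mutex_lock(&pageout_mtx);
	/*
	 * Clear "wanted" before deciding to sleep.  A waker that fires
	 * after this store re-raises the flag and must acquire
	 * pageout_mtx, so its signal cannot slip in between this check
	 * and the wait below.
	 */
	atomic_store(&pageout_wanted, 0);
	if (nothing_to_do)
		pthread_cond_wait(&pageout_cv, &pageout_mtx);
	pthread_mutex_unlock(&pageout_mtx);
	/* Keep "wanted" set while scanning to damp redundant wakeups. */
	atomic_store(&pageout_wanted, 1);
	/* ... scan queues and free pages here ... */
}

int
main(void)
{

	pagedaemon_wakeup_model();	/* raises the flag; nobody sleeps */
	pageout_daemon_iteration(0);	/* clears it, "scans", re-sets it */
	return (0);
}

Of N concurrent wakers, only the first sees zero from atomic_fetch_add() and takes the mutex; the rest return immediately, which is the dampening the "Prevent spurious wakeups" comment in the diff describes.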
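
Similarly, vm_wait_domain() now re-checks vmd_free_count under vm_domainset_lock before sleeping, the same lock vm_domain_clear() holds when it tests vmd_pageout_pages_needed, so pages freed between the caller's unlocked check and its msleep() cannot leave the page daemon stranded. A compact pthread rendering of that recheck-under-lock pattern follows; the condition variable stands in for the kernel's sleep/wakeup pair, and every name here (domainset_mtx, pages_cv, free_count_model, and so on) is invented for the sketch.

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t domainset_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pages_cv = PTHREAD_COND_INITIALIZER;
static atomic_uint free_count_model = 32;
static const unsigned free_min_model = 64;
static int pages_needed;	/* protected by domainset_mtx */

/* Sleeping side: the decision to sleep is made under the lock. */
static void
wait_for_free_pages(void)
{

	pthread_mutex_lock(&domainset_mtx);
	while (atomic_load(&free_count_model) < free_min_model) {
		pages_needed = 1;
		pthread_cond_wait(&pages_cv, &domainset_mtx);
	}
	pthread_mutex_unlock(&domainset_mtx);
}

/* Freeing side: publish the new count, then test the flag locked. */
static void
publish_freed_pages(unsigned n)
{

	atomic_fetch_add(&free_count_model, n);
	pthread_mutex_lock(&domainset_mtx);
	if (pages_needed && atomic_load(&free_count_model) >= free_min_model) {
		pages_needed = 0;
		pthread_cond_broadcast(&pages_cv);
	}
	pthread_mutex_unlock(&domainset_mtx);
}

int
main(void)
{

	publish_freed_pages(64);	/* 32 + 64 >= the minimum */
	wait_for_free_pages();		/* returns without sleeping */
	return (0);
}

Holding the lock across both the final count check and the sleep is what makes the flag safe; the freeing side may bump the count with a bare atomic precisely because it tests pages_needed only after taking the same lock.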