Index: sys/amd64/amd64/machdep.c =================================================================== --- sys/amd64/amd64/machdep.c +++ sys/amd64/amd64/machdep.c @@ -279,7 +279,7 @@ memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } - if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) + if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); @@ -306,8 +306,8 @@ vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", - ptoa((uintmax_t)vm_cnt.v_free_count), - ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); + ptoa((uintmax_t)vm_free_count()), + ptoa((uintmax_t)vm_free_count()) / 1048576); /* * Set up buffers, so they can be used to read disk labels. Index: sys/arm/arm/machdep.c =================================================================== --- sys/arm/arm/machdep.c +++ sys/arm/arm/machdep.c @@ -228,8 +228,8 @@ (uintmax_t)arm32_ptob(realmem), (uintmax_t)arm32_ptob(realmem) / mbyte); printf("avail memory = %ju (%ju MB)\n", - (uintmax_t)arm32_ptob(vm_cnt.v_free_count), - (uintmax_t)arm32_ptob(vm_cnt.v_free_count) / mbyte); + (uintmax_t)arm32_ptob(vm_free_count()), + (uintmax_t)arm32_ptob(vm_free_count()) / mbyte); if (bootverbose) { arm_physmem_print_tables(); devmap_print_table(); Index: sys/arm/arm/pmap-v4.c =================================================================== --- sys/arm/arm/pmap-v4.c +++ sys/arm/arm/pmap-v4.c @@ -3817,7 +3817,7 @@ pv_entry_count++; if (pv_entry_count > pv_entry_high_water) - pagedaemon_wakeup(); + pagedaemon_wakeup(0); /* XXX ARM NUMA */ ret_value = uma_zalloc(pvzone, M_NOWAIT); return ret_value; } Index: sys/cddl/compat/opensolaris/sys/kmem.h =================================================================== --- sys/cddl/compat/opensolaris/sys/kmem.h +++ sys/cddl/compat/opensolaris/sys/kmem.h @@ -78,7 +78,7 @@ int kmem_debugging(void); void *calloc(size_t n, size_t s); -#define freemem vm_cnt.v_free_count +#define freemem vm_free_count() #define minfree vm_cnt.v_free_min #define heap_arena kernel_arena #define zio_arena NULL Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -379,7 +379,7 @@ arc_free_target_init(void *unused __unused) { - zfs_arc_free_target = vm_pageout_wakeup_thresh; + zfs_arc_free_target = (vm_cnt.v_free_min / 10) * 11; } SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, arc_free_target_init, NULL); Index: sys/compat/linprocfs/linprocfs.c =================================================================== --- sys/compat/linprocfs/linprocfs.c +++ sys/compat/linprocfs/linprocfs.c @@ -156,7 +156,7 @@ /* * The correct thing here would be: * - memfree = vm_cnt.v_free_count * PAGE_SIZE; + memfree = vm_free_count() * PAGE_SIZE; memused = memtotal - memfree; * * but it might mislead linux binaries into thinking there @@ -178,7 +178,7 @@ * like unstaticizing it just for linprocfs's sake. 
*/ buffers = 0; - cached = vm_cnt.v_inactive_count * PAGE_SIZE; + cached = vm_inactive_count() * PAGE_SIZE; sbuf_printf(sb, "MemTotal: %9lu kB\n" Index: sys/fs/tmpfs/tmpfs_subr.c =================================================================== --- sys/fs/tmpfs/tmpfs_subr.c +++ sys/fs/tmpfs/tmpfs_subr.c @@ -106,7 +106,8 @@ { vm_ooffset_t avail; - avail = swap_pager_avail + vm_cnt.v_free_count - tmpfs_pages_reserved; + /* XXX */ + avail = swap_pager_avail + vm_free_count() - tmpfs_pages_reserved; if (__predict_false(avail < 0)) avail = 0; return (avail); Index: sys/i386/i386/machdep.c =================================================================== --- sys/i386/i386/machdep.c +++ sys/i386/i386/machdep.c @@ -271,7 +271,7 @@ memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } - if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) + if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); @@ -298,8 +298,8 @@ vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", - ptoa((uintmax_t)vm_cnt.v_free_count), - ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); + ptoa((uintmax_t)vm_free_count()), + ptoa((uintmax_t)vm_free_count()) / 1048576); /* * Set up buffers, so they can be used to read disk labels. Index: sys/kern/init_main.c =================================================================== --- sys/kern/init_main.c +++ sys/kern/init_main.c @@ -87,6 +87,7 @@ #include #include +#include #include #include #include @@ -555,7 +556,7 @@ p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ - pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); + pageablemem = ptoa((vm_paddr_t)vm_free_count()); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; Index: sys/kern/subr_vmem.c =================================================================== --- sys/kern/subr_vmem.c +++ sys/kern/subr_vmem.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "opt_vm.h" @@ -72,6 +73,8 @@ #include #include #include +#include +#include #define VMEM_OPTORDER 5 #define VMEM_OPTVALUE (1 << VMEM_OPTORDER) @@ -641,7 +644,7 @@ * possible due to M_USE_RESERVE page allocation. 
*/ if (wait & M_WAITOK) - VM_WAIT; + vm_wait_domain(domain); return (NULL); } mtx_unlock(&vmem_bt_lock); Index: sys/kern/subr_witness.c =================================================================== --- sys/kern/subr_witness.c +++ sys/kern/subr_witness.c @@ -139,7 +139,7 @@ #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ -#define WITNESS_PENDLIST (2048 + MAXCPU) +#define WITNESS_PENDLIST (2048 + (MAXCPU * 4)) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 Index: sys/mips/mips/machdep.c =================================================================== --- sys/mips/mips/machdep.c +++ sys/mips/mips/machdep.c @@ -210,8 +210,8 @@ vm_ksubmap_init(&kmi); printf("avail memory = %ju (%juMB)\n", - ptoa((uintmax_t)vm_cnt.v_free_count), - ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); + ptoa((uintmax_t)vm_free_count()), + ptoa((uintmax_t)vm_free_count()) / 1048576); cpu_init_interrupts(); /* Index: sys/powerpc/booke/pmap.c =================================================================== --- sys/powerpc/booke/pmap.c +++ sys/powerpc/booke/pmap.c @@ -1183,7 +1183,7 @@ pv_entry_count++; if (pv_entry_count > pv_entry_high_water) - pagedaemon_wakeup(); + pagedaemon_wakeup(0); /* XXX powerpc NUMA */ pv = uma_zalloc(pvzone, M_NOWAIT); return (pv); Index: sys/powerpc/powerpc/machdep.c =================================================================== --- sys/powerpc/powerpc/machdep.c +++ sys/powerpc/powerpc/machdep.c @@ -213,8 +213,8 @@ vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", - ptoa((uintmax_t)vm_cnt.v_free_count), - ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); + ptoa((uintmax_t)vm_free_count()), + ptoa((uintmax_t)vm_free_count()) / 1048576); /* * Set up buffers, so they can be used to read disk labels. 
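The hunks above all follow one conversion pattern: direct reads of the global vm_cnt.v_free_count become calls to the new vm_free_count() accessor (defined in the vm_meter.c hunk later in this diff as a sum of the per-domain vmd_free_count fields), and pagedaemon_wakeup()/VM_WAIT callers now name a domain explicitly. The userspace sketch below is illustrative only and not part of the patch: the mock_domain structure, the two-domain numbers, and the per-domain wakeup threshold are invented for the example; only the summation loop mirrors the vm_free_count() added by this diff.

#include <stdio.h>

#define MOCK_MAXMEMDOM	2

struct mock_domain {
	unsigned int vmd_free_count;		/* free pages in this domain */
	unsigned int vmd_pageout_wakeup_thresh;	/* per-domain wakeup point */
};

static struct mock_domain mock_dom[MOCK_MAXMEMDOM];
static int mock_ndomains = MOCK_MAXMEMDOM;

/* Mirrors the shape of vm_free_count(): sum the per-domain counters. */
static unsigned int
mock_free_count(void)
{
	unsigned int v;
	int i;

	v = 0;
	for (i = 0; i < mock_ndomains; i++)
		v += mock_dom[i].vmd_free_count;
	return (v);
}

int
main(void)
{

	mock_dom[0].vmd_free_count = 100000;
	mock_dom[0].vmd_pageout_wakeup_thresh = 1100;
	mock_dom[1].vmd_free_count = 250;
	mock_dom[1].vmd_pageout_wakeup_thresh = 1100;

	/* Consumers such as the "avail memory" printfs see only the sum. */
	printf("global free pages: %u\n", mock_free_count());

	/*
	 * Wakeups become per-domain decisions: domain 1 is short even
	 * though the global total looks healthy.
	 */
	if (mock_dom[1].vmd_free_count < mock_dom[1].vmd_pageout_wakeup_thresh)
		printf("domain 1 would wake its page daemon\n");
	return (0);
}

The point of the mock values is that a single depleted domain can sit below its own wakeup threshold while the summed count still looks healthy, which is why the later hunks give each domain its own vmd_pageout_wakeup_thresh and its own page daemon wakeups.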
Index: sys/sparc64/sparc64/machdep.c =================================================================== --- sys/sparc64/sparc64/machdep.c +++ sys/sparc64/sparc64/machdep.c @@ -190,8 +190,8 @@ EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); - printf("avail memory = %lu (%lu MB)\n", vm_cnt.v_free_count * PAGE_SIZE, - vm_cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); + printf("avail memory = %lu (%lu MB)\n", vm_free_count() * PAGE_SIZE, + vm_free_count() / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); Index: sys/sys/vmmeter.h =================================================================== --- sys/sys/vmmeter.h +++ sys/sys/vmmeter.h @@ -141,23 +141,23 @@ u_int v_interrupt_free_min; /* (c) reserved pages for int code */ u_int v_free_severe; /* (c) severe page depletion point */ u_int v_wire_count VMMETER_ALIGNED; /* (a) pages wired down */ - u_int v_active_count VMMETER_ALIGNED; /* (a) pages active */ - u_int v_inactive_count VMMETER_ALIGNED; /* (a) pages inactive */ - u_int v_laundry_count VMMETER_ALIGNED; /* (a) pages eligible for - laundering */ - u_int v_free_count VMMETER_ALIGNED; /* (f) pages free */ }; #endif /* _KERNEL || _WANT_VMMETER */ #ifdef _KERNEL +#include + extern struct vmmeter vm_cnt; -extern u_int vm_pageout_wakeup_thresh; +extern domainset_t vm_min_domains; +extern domainset_t vm_severe_domains; #define VM_CNT_ADD(var, x) counter_u64_add(vm_cnt.var, x) #define VM_CNT_INC(var) VM_CNT_ADD(var, 1) #define VM_CNT_FETCH(var) counter_u64_fetch(vm_cnt.var) +u_int vm_free_count(void); + /* * Return TRUE if we are under our severe low-free-pages threshold * @@ -168,7 +168,7 @@ vm_page_count_severe(void) { - return (vm_cnt.v_free_severe > vm_cnt.v_free_count); + return (!DOMAINSET_EMPTY(&vm_severe_domains)); } /* @@ -184,50 +184,8 @@ vm_page_count_min(void) { - return (vm_cnt.v_free_min > vm_cnt.v_free_count); + return (!DOMAINSET_EMPTY(&vm_min_domains)); } -/* - * Return TRUE if we have not reached our free page target during - * free page recovery operations. - */ -static inline int -vm_page_count_target(void) -{ - - return (vm_cnt.v_free_target > vm_cnt.v_free_count); -} - -/* - * Return the number of pages we need to free-up or cache - * A positive number indicates that we do not have enough free pages. - */ -static inline int -vm_paging_target(void) -{ - - return (vm_cnt.v_free_target - vm_cnt.v_free_count); -} - -/* - * Returns TRUE if the pagedaemon needs to be woken up. - */ -static inline int -vm_paging_needed(u_int free_count) -{ - - return (free_count < vm_pageout_wakeup_thresh); -} - -/* - * Return the number of pages we need to launder. - * A positive number indicates that we have a shortfall of clean pages. - */ -static inline int -vm_laundry_target(void) -{ - - return (vm_paging_target()); -} #endif /* _KERNEL */ #endif /* _SYS_VMMETER_H_ */ Index: sys/vm/swap_pager.c =================================================================== --- sys/vm/swap_pager.c +++ sys/vm/swap_pager.c @@ -2327,7 +2327,7 @@ * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. 
*/ - if (vm_cnt.v_free_count + swap_pager_avail < nblks + nswap_lowat) + if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); /* Index: sys/vm/uma.h =================================================================== --- sys/vm/uma.h +++ sys/vm/uma.h @@ -47,6 +47,7 @@ /* Types and type defs */ struct uma_zone; +struct vm_domain_iterator; /* Opaque type used as a handle to the zone */ typedef struct uma_zone * uma_zone_t; Index: sys/vm/uma_core.c =================================================================== --- sys/vm/uma_core.c +++ sys/vm/uma_core.c @@ -3409,7 +3409,7 @@ slab->us_data = (void *)addr; slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC; slab->us_size = size; - slab->us_domain = vm_phys_domidx(PHYS_TO_VM_PAGE( + slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE( pmap_kextract(addr))); uma_total_inc(size); } else { Index: sys/vm/vm_extern.h =================================================================== --- sys/vm/vm_extern.h +++ sys/vm/vm_extern.h @@ -122,5 +122,9 @@ void vm_imgact_unmap_page(struct sf_buf *sf); void vm_thread_dispose(struct thread *td); int vm_thread_new(struct thread *td, int pages); +u_int vm_active_count(void); +u_int vm_inactive_count(void); +u_int vm_laundry_count(void); +u_int vm_wait_count(void); #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ Index: sys/vm/vm_glue.c =================================================================== --- sys/vm/vm_glue.c +++ sys/vm/vm_glue.c @@ -552,7 +552,7 @@ } while (vm_page_count_severe()) { - VM_WAIT; + vm_wait_severe(); } if ((flags & RFMEM) == 0) { Index: sys/vm/vm_init.c =================================================================== --- sys/vm/vm_init.c +++ sys/vm/vm_init.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include Index: sys/vm/vm_kern.c =================================================================== --- sys/vm/vm_kern.c +++ sys/vm/vm_kern.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include #include @@ -196,7 +197,7 @@ if (!vm_page_reclaim_contig_domain(domain, pflags, 1, low, high, PAGE_SIZE, 0) && (flags & M_WAITOK) != 0) - VM_WAIT; + vm_wait_domain(domain); VM_OBJECT_WLOCK(object); tries++; goto retry; @@ -205,9 +206,9 @@ vmem_free(vmem, addr, size); return (0); } - KASSERT(vm_phys_domidx(m) == domain, + KASSERT(vm_phys_domain(m) == domain, ("kmem_alloc_attr_domain: Domain mismatch %d != %d", - vm_phys_domidx(m), domain)); + vm_phys_domain(m), domain)); if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; @@ -280,7 +281,7 @@ if (!vm_page_reclaim_contig_domain(domain, pflags, npages, low, high, alignment, boundary) && (flags & M_WAITOK) != 0) - VM_WAIT; + vm_wait_domain(domain); VM_OBJECT_WLOCK(object); tries++; goto retry; @@ -288,9 +289,9 @@ vmem_free(vmem, addr, size); return (0); } - KASSERT(vm_phys_domidx(m) == domain, + KASSERT(vm_phys_domain(m) == domain, ("kmem_alloc_contig_domain: Domain mismatch %d != %d", - vm_phys_domidx(m), domain)); + vm_phys_domain(m), domain)); end_m = m + npages; tmp = addr; for (; m < end_m; m++) { @@ -452,9 +453,9 @@ kmem_unback(object, addr, i); return (KERN_NO_SPACE); } - KASSERT(vm_phys_domidx(m) == domain, + KASSERT(vm_phys_domain(m) == domain, ("kmem_back_domain: Domain mismatch %d != %d", - vm_phys_domidx(m), domain)); + vm_phys_domain(m), domain)); if (flags & M_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); KASSERT((m->oflags & VPO_UNMANAGED) != 0, @@ -514,7 +515,7 @@ end = offset + size; 
VM_OBJECT_WLOCK(object); m = vm_page_lookup(object, atop(offset)); - domain = vm_phys_domidx(m); + domain = vm_phys_domain(m); for (; offset < end; offset += PAGE_SIZE, m = next) { next = vm_page_next(m); vm_page_unwire(m, PQ_NONE); Index: sys/vm/vm_map.c =================================================================== --- sys/vm/vm_map.c +++ sys/vm/vm_map.c @@ -2016,7 +2016,7 @@ * free pages allocating pv entries. */ if (((flags & MAP_PREFAULT_MADVISE) != 0 && - vm_cnt.v_free_count < vm_cnt.v_free_reserved) || + vm_page_count_severe()) || ((flags & MAP_PREFAULT_PARTIAL) != 0 && tmpidx >= threshold)) { psize = tmpidx; Index: sys/vm/vm_meter.c =================================================================== --- sys/vm/vm_meter.c +++ sys/vm/vm_meter.c @@ -53,6 +53,8 @@ #include #include #include +#include +#include #include #include #include @@ -213,9 +215,6 @@ total.t_dw++; else total.t_sl++; - if (td->td_wchan == - &vm_cnt.v_free_count) - total.t_pw++; } break; case TDS_CAN_RUN: @@ -283,7 +282,8 @@ } } mtx_unlock(&vm_object_list_mtx); - total.t_free = vm_cnt.v_free_count; + total.t_pw = vm_wait_count(); + total.t_free = vm_free_count(); #if defined(COMPAT_FREEBSD11) /* sysctl(8) allocates twice as much memory as reported by sysctl(3) */ if (curproc->p_osrel < P_OSREL_VMTOTAL64 && (req->oldlen == @@ -339,7 +339,7 @@ #define VM_STATS(parent, var, descr) \ SYSCTL_OID(parent, OID_AUTO, var, CTLTYPE_U64 | CTLFLAG_MPSAFE | \ - CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr); + CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) @@ -379,19 +379,36 @@ VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()"); VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel"); +static int +sysctl_handle_vmstat_proc(SYSCTL_HANDLER_ARGS) +{ + u_int (*fn)(void); + uint32_t val; + + fn = arg1; + val = fn(); + return (SYSCTL_OUT(req, &val, sizeof(val))); +} + +#define VM_STATS_PROC(var, descr, fn) \ + SYSCTL_OID(_vm_stats_vm, OID_AUTO, var, CTLTYPE_U32 | CTLFLAG_MPSAFE | \ + CTLFLAG_RD, fn, 0, sysctl_handle_vmstat_proc, "IU", descr) + #define VM_STATS_UINT(var, descr) \ SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) + VM_STATS_UINT(v_page_size, "Page size in bytes"); VM_STATS_UINT(v_page_count, "Total number of pages in system"); VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock"); VM_STATS_UINT(v_free_target, "Pages desired free"); VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold"); -VM_STATS_UINT(v_free_count, "Free pages"); +VM_STATS_PROC(v_free_count, "Free pages", vm_free_count); VM_STATS_UINT(v_wire_count, "Wired pages"); -VM_STATS_UINT(v_active_count, "Active pages"); +VM_STATS_PROC(v_active_count, "Active pages", vm_active_count); VM_STATS_UINT(v_inactive_target, "Desired inactive pages"); -VM_STATS_UINT(v_inactive_count, "Inactive pages"); -VM_STATS_UINT(v_laundry_count, "Pages eligible for laundering"); +VM_STATS_PROC(v_inactive_count, "Inactive pages", vm_inactive_count); +VM_STATS_PROC(v_laundry_count, "Pages eligible for laundering", + vm_laundry_count); VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel"); VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code"); VM_STATS_UINT(v_free_severe, "Severe page depletion point"); @@ -406,3 +423,52 @@ SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD, SYSCTL_NULL_UINT_PTR, 0, "Dummy for 
compatibility"); #endif + +u_int +vm_free_count(void) +{ + u_int v; + int i; + + v = 0; + for (i = 0; i < vm_ndomains; i++) + v += vm_dom[i].vmd_free_count; + + return (v); +} + +static +u_int +vm_pagequeue_count(int pq) +{ + u_int v; + int i; + + v = 0; + for (i = 0; i < vm_ndomains; i++) + v += vm_dom[i].vmd_pagequeues[pq].pq_cnt; + + return (v); +} + +u_int +vm_active_count(void) +{ + + return vm_pagequeue_count(PQ_ACTIVE); +} + +u_int +vm_inactive_count(void) +{ + + return vm_pagequeue_count(PQ_INACTIVE); +} + +u_int +vm_laundry_count(void) +{ + + return vm_pagequeue_count(PQ_LAUNDRY); +} + Index: sys/vm/vm_object.h =================================================================== --- sys/vm/vm_object.h +++ sys/vm/vm_object.h @@ -297,6 +297,17 @@ } } +static __inline bool +vm_object_reserv(vm_object_t object) +{ + + if (object != NULL && + (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) == OBJ_COLORED) { + return (true); + } + return (false); +} + void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c +++ sys/vm/vm_object.c @@ -96,6 +96,8 @@ #include #include #include +#include +#include #include #include #include Index: sys/vm/vm_page.h =================================================================== --- sys/vm/vm_page.h +++ sys/vm/vm_page.h @@ -218,54 +218,10 @@ #endif SLIST_HEAD(spglist, vm_page); -struct vm_pagequeue { - struct mtx pq_mutex; - struct pglist pq_pl; - int pq_cnt; - u_int * const pq_vcnt; - const char * const pq_name; -} __aligned(CACHE_LINE_SIZE); - - -struct vm_domain { - struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; - struct vmem *vmd_kernel_arena; - u_int vmd_page_count; - u_int vmd_free_count; - long vmd_segs; /* bitmask of the segments */ - boolean_t vmd_oom; - int vmd_oom_seq; - int vmd_last_active_scan; - struct vm_page vmd_laundry_marker; - struct vm_page vmd_marker; /* marker for pagedaemon private use */ - struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ -}; - -extern struct vm_domain vm_dom[MAXMEMDOM]; - -#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) -#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) -#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) -#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) - #ifdef _KERNEL extern vm_page_t bogus_page; - -static __inline void -vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) -{ - -#ifdef notyet - vm_pagequeue_assert_locked(pq); -#endif - pq->pq_cnt += addend; - atomic_add_int(pq->pq_vcnt, addend); -} -#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) -#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) #endif /* _KERNEL */ -extern struct mtx_padalign vm_page_queue_free_mtx; extern struct mtx_padalign pa_lock[]; #if defined(__arm__) Index: sys/vm/vm_page.c =================================================================== --- sys/vm/vm_page.c +++ sys/vm/vm_page.c @@ -115,8 +115,9 @@ #include #include #include -#include #include +#include +#include #include #include #include @@ -131,10 +132,16 @@ */ struct vm_domain vm_dom[MAXMEMDOM]; -struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx; struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; +struct mtx_padalign __exclusive_cache_line vm_domainset_lock; +domainset_t __exclusive_cache_line 
vm_min_domains; +domainset_t __exclusive_cache_line vm_severe_domains; +static int vm_min_waiters; +static int vm_severe_waiters; +static int vm_pageproc_waiters; + /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. @@ -159,24 +166,22 @@ SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); -/* Is the page daemon waiting for free pages? */ -static int vm_pageout_pages_needed; - static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_free_phys(vm_page_t m); -static void vm_page_free_wakeup(void); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); -static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, - vm_paddr_t high); -static int vm_page_alloc_fail(vm_object_t object, int req); +static int vm_page_reclaim_run(int req_class, int domain, u_long npages, + vm_page_t m_run, vm_paddr_t high); +static void vm_domain_free_wakeup(struct vm_domain *); +static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, + int req); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); @@ -313,6 +318,7 @@ static void vm_page_blacklist_check(char *list, char *end) { + struct vm_domain *vmd; vm_paddr_t pa; vm_page_t m; char *next; @@ -325,9 +331,10 @@ m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) continue; - mtx_lock(&vm_page_queue_free_mtx); + vmd = vm_pagequeue_domain(m); + vm_domain_free_lock(vmd); ret = vm_phys_unfree_page(m); - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_free_unlock(vmd); if (ret == TRUE) { TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (bootverbose) @@ -390,28 +397,23 @@ } static void -vm_page_domain_init(struct vm_domain *vmd) +vm_page_domain_init(int domain) { + struct vm_domain *vmd; struct vm_pagequeue *pq; int i; + vmd = VM_DOMAIN(domain); + bzero(vmd, sizeof(*vmd)); *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; - *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = - &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; - *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = - &vm_cnt.v_active_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; - *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = - &vm_cnt.v_laundry_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; - /* Unswappable dirty pages are counted as being in the laundry. 
*/ - *__DECONST(int **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_vcnt) = - &vm_cnt.v_laundry_count; + vmd->vmd_domain = domain; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; @@ -422,6 +424,7 @@ mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } + mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); } /* @@ -458,7 +461,6 @@ vm_offset_t vm_page_startup(vm_offset_t vaddr) { - struct vm_domain *vmd; struct vm_phys_seg *seg; vm_page_t m; char *list, *listend; @@ -489,11 +491,11 @@ /* * Initialize the page and queue locks. */ - mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); + mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) - vm_page_domain_init(&vm_dom[i]); + vm_page_domain_init(i); /* * Almost all of the pages needed for bootstrapping UMA are used @@ -691,7 +693,6 @@ * physical memory allocator's free lists. */ vm_cnt.v_page_count = 0; - vm_cnt.v_free_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; @@ -706,6 +707,8 @@ * or doesn't overlap any of them. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { + struct vm_domain *vmd; + if (seg->start < phys_avail[i] || seg->end > phys_avail[i + 1]) continue; @@ -713,13 +716,14 @@ m = seg->first_page; pagecount = (u_long)atop(seg->end - seg->start); - mtx_lock(&vm_page_queue_free_mtx); + vmd = VM_DOMAIN(seg->domain); + vm_domain_free_lock(vmd); vm_phys_free_contig(m, pagecount); - vm_phys_freecnt_adj(m, (int)pagecount); - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_freecnt_adj(vmd, (int)pagecount); + vm_domain_free_unlock(vmd); vm_cnt.v_page_count += (u_int)pagecount; - vmd = &vm_dom[seg->domain]; + vmd = VM_DOMAIN(seg->domain);; vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << m->segind; break; @@ -1644,12 +1648,40 @@ return (m); } +/* + * Returns true if the number of free pages exceeds the minimum + * for the request class and false otherwise. + */ +int +vm_domain_available(struct vm_domain *vmd, int req, int npages) +{ + + vm_domain_free_assert_locked(vmd); + req = req & VM_ALLOC_CLASS_MASK; + + /* + * The page daemon is allowed to dig deeper into the free page list. + */ + if (curproc == pageproc && req != VM_ALLOC_INTERRUPT) + req = VM_ALLOC_SYSTEM; + + if (vmd->vmd_free_count >= npages + vmd->vmd_free_reserved || + (req == VM_ALLOC_SYSTEM && + vmd->vmd_free_count >= npages + vmd->vmd_interrupt_free_min) || + (req == VM_ALLOC_INTERRUPT && + vmd->vmd_free_count >= npages)) + return (1); + + return (0); +} + vm_page_t vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { + struct vm_domain *vmd; vm_page_t m; - int flags, req_class; + int flags; u_int free_count; KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && @@ -1665,34 +1697,27 @@ if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); - req_class = req & VM_ALLOC_CLASS_MASK; - - /* - * The page daemon is allowed to dig deeper into the free page list. - */ - if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) - req_class = VM_ALLOC_SYSTEM; - - /* - * Allocate a page if the number of free pages exceeds the minimum - * for the request class. 
- */ again: m = NULL; - mtx_lock(&vm_page_queue_free_mtx); - if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || - (req_class == VM_ALLOC_SYSTEM && - vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || - (req_class == VM_ALLOC_INTERRUPT && - vm_cnt.v_free_count > 0)) { +#if VM_NRESERVLEVEL > 0 + if (vm_object_reserv(object) && + (m = vm_reserv_extend(req, object, pindex, domain, mpred)) + != NULL) { + domain = vm_phys_domain(m); + vmd = VM_DOMAIN(domain); + goto found; + } +#endif + vmd = VM_DOMAIN(domain); + vm_domain_free_lock(vmd); + if (vm_domain_available(vmd, req, 1)) { /* * Can we allocate the page from a reservation? */ #if VM_NRESERVLEVEL > 0 - if (object == NULL || (object->flags & (OBJ_COLORED | - OBJ_FICTITIOUS)) != OBJ_COLORED || (m = - vm_reserv_alloc_page(object, pindex, domain, - mpred)) == NULL) + if (!vm_object_reserv(object) || + (m = vm_reserv_alloc_page(object, pindex, + domain, mpred)) == NULL) #endif { /* @@ -1714,7 +1739,7 @@ /* * Not allocatable, give up. */ - if (vm_page_alloc_fail(object, req)) + if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } @@ -1723,8 +1748,18 @@ * At this point we had better have found a good page. */ KASSERT(m != NULL, ("missing page")); - free_count = vm_phys_freecnt_adj(m, -1); - mtx_unlock(&vm_page_queue_free_mtx); + free_count = vm_domain_freecnt_adj(vmd, -1); + vm_domain_free_unlock(vmd); + + /* + * Don't wakeup too often - wakeup the pageout daemon when + * we would be nearly out of memory. + */ + if (vm_paging_needed(vmd, free_count)) + pagedaemon_wakeup(vmd->vmd_domain); +#if VM_NRESERVLEVEL > 0 +found: +#endif vm_page_alloc_check(m); /* @@ -1757,7 +1792,7 @@ if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { - pagedaemon_wakeup(); + pagedaemon_wakeup(domain); if (req & VM_ALLOC_WIRED) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; @@ -1782,13 +1817,6 @@ } else m->pindex = pindex; - /* - * Don't wakeup too often - wakeup the pageout daemon when - * we would be nearly out of memory. - */ - if (vm_paging_needed(free_count)) - pagedaemon_wakeup(); - return (m); } @@ -1856,9 +1884,9 @@ int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { + struct vm_domain *vmd; vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; - int req_class; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && @@ -1876,14 +1904,7 @@ object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); - req_class = req & VM_ALLOC_CLASS_MASK; - /* - * The page daemon is allowed to dig deeper into the free page list. - */ - if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) - req_class = VM_ALLOC_SYSTEM; - if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, @@ -1895,19 +1916,25 @@ * below the lower bound for the allocation class? 
*/ again: +#if VM_NRESERVLEVEL > 0 + if (vm_object_reserv(object) && + (m_ret = vm_reserv_extend_contig(req, object, pindex, domain, + npages, low, high, alignment, boundary, mpred)) != NULL) { + domain = vm_phys_domain(m_ret); + vmd = VM_DOMAIN(domain); + goto found; + } +#endif m_ret = NULL; - mtx_lock(&vm_page_queue_free_mtx); - if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved || - (req_class == VM_ALLOC_SYSTEM && - vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) || - (req_class == VM_ALLOC_INTERRUPT && - vm_cnt.v_free_count >= npages)) { + vmd = VM_DOMAIN(domain); + vm_domain_free_lock(vmd); + if (vm_domain_available(vmd, req, npages)) { /* * Can we allocate the pages from a reservation? */ #if VM_NRESERVLEVEL > 0 retry: - if (object == NULL || (object->flags & OBJ_COLORED) == 0 || + if (!vm_object_reserv(object) || (m_ret = vm_reserv_alloc_contig(object, pindex, domain, npages, low, high, alignment, boundary, mpred)) == NULL) #endif @@ -1923,12 +1950,15 @@ #endif } if (m_ret == NULL) { - if (vm_page_alloc_fail(object, req)) + if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } - vm_phys_freecnt_adj(m_ret, -npages); - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_freecnt_adj(vmd, -npages); + vm_domain_free_unlock(vmd); +#if VM_NRESERVLEVEL > 0 +found: +#endif for (m = m_ret; m < &m_ret[npages]; m++) vm_page_alloc_check(m); @@ -1964,7 +1994,7 @@ m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { - pagedaemon_wakeup(); + pagedaemon_wakeup(domain); if ((req & VM_ALLOC_WIRED) != 0) atomic_subtract_int( &vm_cnt.v_wire_count, npages); @@ -1994,8 +2024,9 @@ pmap_page_set_memattr(m, memattr); pindex++; } - if (vm_paging_needed(vm_cnt.v_free_count)) - pagedaemon_wakeup(); + vmd = VM_DOMAIN(domain); + if (vm_paging_needed(vmd, vmd->vmd_free_count)) + pagedaemon_wakeup(domain); return (m_ret); } @@ -2057,37 +2088,26 @@ vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { + struct vm_domain *vmd; vm_page_t m; u_int flags, free_count; - int req_class; - req_class = req & VM_ALLOC_CLASS_MASK; - /* - * The page daemon is allowed to dig deeper into the free page list. - */ - if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) - req_class = VM_ALLOC_SYSTEM; - - /* * Do not allocate reserved pages unless the req has asked for it. */ + vmd = VM_DOMAIN(domain); again: - mtx_lock(&vm_page_queue_free_mtx); - if (vm_cnt.v_free_count > vm_cnt.v_free_reserved || - (req_class == VM_ALLOC_SYSTEM && - vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) || - (req_class == VM_ALLOC_INTERRUPT && - vm_cnt.v_free_count > 0)) + vm_domain_free_lock(vmd); + if (vm_domain_available(vmd, req, 1)) m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); if (m == NULL) { - if (vm_page_alloc_fail(NULL, req)) + if (vm_domain_alloc_fail(vmd, NULL, req)) goto again; return (NULL); } - free_count = vm_phys_freecnt_adj(m, -1); - mtx_unlock(&vm_page_queue_free_mtx); + free_count = vm_domain_freecnt_adj(vmd, -1); + vm_domain_free_unlock(vmd); vm_page_alloc_check(m); /* @@ -2108,8 +2128,8 @@ } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; - if (vm_paging_needed(free_count)) - pagedaemon_wakeup(); + if (vm_paging_needed(vmd, free_count)) + pagedaemon_wakeup(domain); return (m); } @@ -2331,9 +2351,10 @@ * "req_class" must be an allocation class. 
*/ static int -vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, +vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { + struct vm_domain *vmd; struct mtx *m_mtx; struct spglist free; vm_object_t object; @@ -2483,7 +2504,9 @@ unlock: VM_OBJECT_WUNLOCK(object); } else { - mtx_lock(&vm_page_queue_free_mtx); + MPASS(vm_phys_domain(m) == domain); + vmd = VM_DOMAIN(domain); + vm_domain_free_lock(vmd); order = m->order; if (order < VM_NFREEORDER) { /* @@ -2500,7 +2523,7 @@ else if (vm_reserv_is_page_free(m)) order = 0; #endif - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_free_unlock(vmd); if (order == VM_NFREEORDER) error = EINVAL; } @@ -2508,13 +2531,15 @@ if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { - mtx_lock(&vm_page_queue_free_mtx); + vmd = VM_DOMAIN(domain); + vm_domain_free_lock(vmd); do { + MPASS(vm_phys_domain(m) == domain); SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_page_free_phys(m); } while ((m = SLIST_FIRST(&free)) != NULL); - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_free_wakeup(vmd); + vm_domain_free_unlock(vmd); } return (error); } @@ -2554,6 +2579,7 @@ vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { + struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; @@ -2574,9 +2600,10 @@ * Return if the number of free pages cannot satisfy the requested * allocation. */ - count = vm_cnt.v_free_count; - if (count < npages + vm_cnt.v_free_reserved || (count < npages + - vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || + vmd = VM_DOMAIN(domain); + count = vmd->vmd_free_count; + if (count < npages + vmd->vmd_free_reserved || (count < npages + + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); @@ -2612,8 +2639,8 @@ for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; - error = vm_page_reclaim_run(req_class, npages, m_run, - high); + error = vm_page_reclaim_run(req_class, domain, npages, + m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) @@ -2653,66 +2680,190 @@ return (ret); } +/* + * Set the domain in the appropriate page level domainset. + */ +void +vm_domain_set(struct vm_domain *vmd) +{ + mtx_lock(&vm_domainset_lock); + if (!vmd->vmd_minset && vm_paging_min(vmd)) { + vmd->vmd_minset = 1; + DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); + } + if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { + vmd->vmd_severeset = 1; + DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); + } + mtx_unlock(&vm_domainset_lock); +} + /* - * vm_wait: (also see VM_WAIT macro) + * Clear the domain from the appropriate page level domainset. + */ +static void +vm_domain_clear(struct vm_domain *vmd) +{ + + mtx_lock(&vm_domainset_lock); + if (vmd->vmd_minset && !vm_paging_min(vmd)) { + vmd->vmd_minset = 0; + DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); + if (vm_min_waiters != 0) { + vm_min_waiters = 0; + wakeup(&vm_min_domains); + } + } + if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { + vmd->vmd_severeset = 0; + DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); + if (vm_severe_waiters != 0) { + vm_severe_waiters = 0; + wakeup(&vm_severe_domains); + } + } + mtx_unlock(&vm_domainset_lock); +} + +/* + * Wait for free pages to exceed the min threshold globally. 
+ */ +void +vm_wait_min(void) +{ + + mtx_lock(&vm_domainset_lock); + while (vm_page_count_min()) { + vm_min_waiters++; + msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); + } + mtx_unlock(&vm_domainset_lock); +} + +/* + * Wait for free pages to exceed the severe threshold globally. + */ +void +vm_wait_severe(void) +{ + + mtx_lock(&vm_domainset_lock); + while (vm_page_count_severe()) { + vm_severe_waiters++; + msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); + } + mtx_unlock(&vm_domainset_lock); +} + +u_int +vm_wait_count(void) +{ + u_int cnt; + int i; + + cnt = 0; + for (i = 0; i < vm_ndomains; i++) + cnt += VM_DOMAIN(i)->vmd_waiters; + cnt += vm_severe_waiters + vm_min_waiters; + + return (cnt); +} + +/* + * vm_wait_domain: * * Sleep until free pages are available for allocation. - * - Called in various places before memory allocations. + * - Called in various places after failed memory allocations. */ -static void -_vm_wait(void) +void +vm_wait_domain(int domain) { + struct vm_domain *vmd; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vmd = VM_DOMAIN(domain); + vm_domain_free_assert_locked(vmd); + if (curproc == pageproc) { - vm_pageout_pages_needed = 1; - msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, - PDROP | PSWP, "VMWait", 0); + vmd->vmd_pageout_pages_needed = 1; + msleep(&vmd->vmd_pageout_pages_needed, + vm_domain_free_lockptr(vmd), PDROP | PSWP, "VMWait", 0); } else { if (pageproc == NULL) panic("vm_wait in early boot"); - pagedaemon_wait(PVM, "vmwait"); + pagedaemon_wait(domain, PVM, "vmwait"); } } +/* + * vm_wait: (also see VM_WAIT macro) + * + * Sleep until free pages are available for allocation. + * - Called in various places after failed memory allocations. + */ void vm_wait(void) { - mtx_lock(&vm_page_queue_free_mtx); - _vm_wait(); + /* + * We use racey wakeup synchronization to avoid expensive global + * locking for the pageproc when sleeping with a non-specific vm_wait. + * To handle this, we only sleep for one tick in this instance. It + * is expected that most allocations for the pageproc will come from + * kmem or vm_page_grab* which will use the more specific and + * race-free vm_wait_domain(). + */ + if (curproc == pageproc) { + mtx_lock(&vm_domainset_lock); + vm_pageproc_waiters++; + msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM, + "pageprocwait", 1); + mtx_unlock(&vm_domainset_lock); + } else { + /* + * XXX Ideally we would wait only until the allocation could + * be satisfied. This condition can cause new allocators to + * consume all freed pages while old allocators wait. + */ + mtx_lock(&vm_domainset_lock); + if (vm_page_count_min()) { + vm_min_waiters++; + msleep(&vm_min_domains, &vm_domainset_lock, PVM, + "vmwait", 0); + } + mtx_unlock(&vm_domainset_lock); + } } /* - * vm_page_alloc_fail: + * vm_domain_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the - * page_queue_free and object lock on entry. Returns with the + * domain_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. 
* */ static int -vm_page_alloc_fail(vm_object_t object, int req) +vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) { - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(vmd); - atomic_add_int(&vm_pageout_deficit, + atomic_add_int(&vmd->vmd_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { if (object != NULL) VM_OBJECT_WUNLOCK(object); - _vm_wait(); + vm_wait_domain(vmd->vmd_domain); if (object != NULL) VM_OBJECT_WLOCK(object); if (req & VM_ALLOC_WAITOK) return (EAGAIN); } else { - mtx_unlock(&vm_page_queue_free_mtx); - pagedaemon_wakeup(); + vm_domain_free_unlock(vmd); + pagedaemon_wakeup(vmd->vmd_domain); } return (0); } @@ -2731,18 +2882,19 @@ vm_waitpfault(void) { - mtx_lock(&vm_page_queue_free_mtx); - pagedaemon_wait(PUSER, "pfault"); + mtx_lock(&vm_domainset_lock); + if (vm_page_count_min()) { + vm_min_waiters++; + msleep(&vm_min_domains, &vm_domainset_lock, PUSER, "pfault", 0); + } + mtx_unlock(&vm_domainset_lock); } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { - if (vm_page_in_laundry(m)) - return (&vm_dom[0].vmd_pagequeues[m->queue]); - else - return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); + return (&vm_pagequeue_domain(m)->vmd_pagequeues[m->queue]); } /* @@ -2804,10 +2956,7 @@ KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); - if (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE) - pq = &vm_dom[0].vmd_pagequeues[queue]; - else - pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; + pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); @@ -2889,7 +3038,7 @@ } /* - * vm_page_free_wakeup: + * vm_domain_free_wakeup: * * Helper routine for vm_page_free_toq(). This routine is called * when a page is added to the free queues. @@ -2897,28 +3046,39 @@ * The page queues must be locked. */ static void -vm_page_free_wakeup(void) +vm_domain_free_wakeup(struct vm_domain *vmd) { - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(vmd); + /* * if pageout daemon needs pages, then tell it that there are * some free. */ - if (vm_pageout_pages_needed && - vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { - wakeup(&vm_pageout_pages_needed); - vm_pageout_pages_needed = 0; + if (vmd->vmd_pageout_pages_needed && + vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { + wakeup(&vmd->vmd_pageout_pages_needed); + vmd->vmd_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. 
*/ - if (vm_pages_needed && !vm_page_count_min()) { - vm_pages_needed = false; - wakeup(&vm_cnt.v_free_count); + if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) { + vmd->vmd_pages_needed = false; + wakeup(&vmd->vmd_free_count); } + if ((vmd->vmd_minset && !vm_paging_min(vmd)) || + (vmd->vmd_severeset && !vm_paging_severe(vmd))) + vm_domain_clear(vmd); + + /* See comments in vm_wait(); */ + if (vm_pageproc_waiters) { + vm_pageproc_waiters = 0; + wakeup(&vm_pageproc_waiters); + } + } /* @@ -3008,9 +3168,9 @@ vm_page_free_phys(vm_page_t m) { - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(vm_pagequeue_domain(m)); - vm_phys_freecnt_adj(m, 1); + vm_domain_freecnt_adj(vm_pagequeue_domain(m), 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #endif @@ -3020,15 +3180,27 @@ void vm_page_free_phys_pglist(struct pglist *tq) { + struct vm_domain *vmd; vm_page_t m; if (TAILQ_EMPTY(tq)) return; - mtx_lock(&vm_page_queue_free_mtx); - TAILQ_FOREACH(m, tq, listq) + vmd = NULL; + TAILQ_FOREACH(m, tq, listq) { + if (vmd != vm_pagequeue_domain(m)) { + if (vmd != NULL) { + vm_domain_free_wakeup(vmd); + vm_domain_free_unlock(vmd); + } + vmd = vm_pagequeue_domain(m); + vm_domain_free_lock(vmd); + } vm_page_free_phys(m); - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); + } + if (vmd != NULL) { + vm_domain_free_wakeup(vmd); + vm_domain_free_unlock(vmd); + } } /* @@ -3043,13 +3215,15 @@ void vm_page_free_toq(vm_page_t m) { + struct vm_domain *vmd; if (!vm_page_free_prep(m, false)) return; - mtx_lock(&vm_page_queue_free_mtx); + vmd = vm_pagequeue_domain(m); + vm_domain_free_lock(vmd); vm_page_free_phys(m); - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_free_wakeup(vmd); + vm_domain_free_unlock(vmd); } /* @@ -3160,7 +3334,7 @@ if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { - pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; + pq = &vm_pagequeue_domain(m)->vmd_pagequeues[PQ_INACTIVE]; /* Avoid multiple acquisitions of the inactive queue lock. 
*/ if (queue == PQ_INACTIVE) { vm_pagequeue_lock(pq); @@ -3172,8 +3346,9 @@ } m->queue = PQ_INACTIVE; if (noreuse) - TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead, - m, plinks.q); + TAILQ_INSERT_BEFORE( + &vm_pagequeue_domain(m)->vmd_inacthead, m, + plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); @@ -3950,10 +4125,10 @@ DB_SHOW_COMMAND(page, vm_page_print_page_info) { - db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count); - db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); - db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); - db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); + db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); + db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); + db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); + db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); @@ -3965,7 +4140,7 @@ { int dom; - db_printf("pq_free %d\n", vm_cnt.v_free_count); + db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", Index: sys/vm/vm_pageout.h =================================================================== --- sys/vm/vm_pageout.h +++ sys/vm/vm_pageout.h @@ -74,9 +74,7 @@ */ extern int vm_page_max_wired; -extern int vm_pageout_deficit; extern int vm_pageout_page_count; -extern bool vm_pages_needed; #define VM_OOM_MEM 1 #define VM_OOM_SWAPZ 2 @@ -95,12 +93,15 @@ * Signal pageout-daemon and wait for it. */ -void pagedaemon_wait(int pri, const char *wmesg); -void pagedaemon_wakeup(void); +void pagedaemon_wait(int domain, int pri, const char *wmesg); +void pagedaemon_wakeup(int domain); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() void vm_wait(void); void vm_waitpfault(void); +void vm_wait_domain(int domain); +void vm_wait_min(void); +void vm_wait_severe(void); #ifdef _KERNEL int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *); Index: sys/vm/vm_pageout.c =================================================================== --- sys/vm/vm_pageout.c +++ sys/vm/vm_pageout.c @@ -110,6 +110,7 @@ #include #include #include +#include #include #include #include @@ -147,20 +148,8 @@ #define VM_LAUNDER_RATE 10 #define VM_INACT_SCAN_RATE 2 -int vm_pageout_deficit; /* Estimated number of pages deficit */ -u_int vm_pageout_wakeup_thresh; static int vm_pageout_oom_seq = 12; -static bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ -bool vm_pages_needed; /* Are threads waiting for free pages? */ -/* Pending request for dirty page laundering. 
*/ -static enum { - VM_LAUNDRY_IDLE, - VM_LAUNDRY_BACKGROUND, - VM_LAUNDRY_SHORTFALL -} vm_laundry_request = VM_LAUNDRY_IDLE; -static int vm_inactq_scans; - static int vm_pageout_update_period; static int disable_swap_pageouts; static int lowmem_period = 10; @@ -173,10 +162,6 @@ CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "panic on out of memory instead of killing the largest process"); -SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, - CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0, - "free page threshold for waking up the pageout daemon"); - SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RWTUN, &vm_pageout_update_period, 0, "Maximum active LRU update period"); @@ -200,11 +185,6 @@ &act_scan_laundry_weight, 0, "weight given to clean vs. dirty pages in active queue scans"); -static u_int vm_background_launder_target; -SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN, - &vm_background_launder_target, 0, - "background laundering target, in pages"); - static u_int vm_background_launder_rate = 4096; SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, &vm_background_launder_rate, 0, @@ -959,18 +939,18 @@ static void vm_pageout_laundry_worker(void *arg) { - struct vm_domain *domain; + struct vm_domain *vmd; struct vm_pagequeue *pq; uint64_t nclean, ndirty; u_int inactq_scans, last_launder; - int domidx, last_target, launder, shortfall, shortfall_cycle, target; + int domain, last_target, launder, shortfall, shortfall_cycle, target; bool in_shortfall; - domidx = (uintptr_t)arg; - domain = &vm_dom[domidx]; - pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; - KASSERT(domain->vmd_segs != 0, ("domain without segments")); - vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); + domain = (uintptr_t)arg; + vmd = VM_DOMAIN(domain); + pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; + KASSERT(vmd->vmd_segs != 0, ("domain without segments")); + vm_pageout_init_marker(&vmd->vmd_laundry_marker, PQ_LAUNDRY); shortfall = 0; in_shortfall = false; @@ -982,9 +962,9 @@ /* * Calls to these handlers are serialized by the swap syscall lock. */ - (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, domain, + (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd, EVENTHANDLER_PRI_ANY); - (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, domain, + (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd, EVENTHANDLER_PRI_ANY); /* @@ -1006,7 +986,7 @@ target = shortfall; } else if (!in_shortfall) goto trybackground; - else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { + else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) { /* * We recently entered shortfall and began laundering * pages. If we have completed that laundering run @@ -1040,11 +1020,12 @@ * memory pressure required to trigger laundering decreases. */ trybackground: - nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; - ndirty = vm_cnt.v_laundry_count; + nclean = vmd->vmd_free_count + + vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt; + ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt; if (target == 0 && inactq_scans != last_launder && ndirty * isqrt(inactq_scans - last_launder) >= nclean) { - target = vm_background_launder_target; + target = vmd->vmd_background_launder_target; } /* @@ -1076,7 +1057,7 @@ * pages could exceed "target" by the maximum size of * a cluster minus one. */ - target -= min(vm_pageout_launder(domain, launder, + target -= min(vm_pageout_launder(vmd, launder, in_shortfall), target); pause("laundp", hz / VM_LAUNDER_RATE); } @@ -1087,8 +1068,8 @@ * kicks us. 
*/ vm_pagequeue_lock(pq); - if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) - (void)mtx_sleep(&vm_laundry_request, + if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE) + (void)mtx_sleep(&vmd->vmd_laundry_request, vm_pagequeue_lockptr(pq), PVM, "launds", 0); /* @@ -1096,16 +1077,17 @@ * a shortfall laundering unless we're already in the middle of * one. This may preempt a background laundering. */ - if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && + if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL && (!in_shortfall || shortfall_cycle == 0)) { - shortfall = vm_laundry_target() + vm_pageout_deficit; + shortfall = vm_laundry_target(vmd) + + vmd->vmd_pageout_deficit; target = 0; } else shortfall = 0; if (target == 0) - vm_laundry_request = VM_LAUNDRY_IDLE; - inactq_scans = vm_inactq_scans; + vmd->vmd_laundry_request = VM_LAUNDRY_IDLE; + inactq_scans = vmd->vmd_inactq_scans; vm_pagequeue_unlock(pq); } } @@ -1134,7 +1116,7 @@ * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ - if (vmd == &vm_dom[0] && pass > 0 && + if (vmd == VM_DOMAIN(0) && pass > 0 && (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. @@ -1163,8 +1145,8 @@ * the page daemon and this calculation. */ if (pass > 0) { - deficit = atomic_readandclear_int(&vm_pageout_deficit); - page_shortage = vm_paging_target() + deficit; + deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit); + page_shortage = vm_paging_target(vmd) + deficit; } else page_shortage = deficit = 0; starting_page_shortage = page_shortage; @@ -1357,18 +1339,20 @@ * keep count. */ if (starting_page_shortage > 0) { - pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; + pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; vm_pagequeue_lock(pq); - if (vm_laundry_request == VM_LAUNDRY_IDLE && + if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE && (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) { if (page_shortage > 0) { - vm_laundry_request = VM_LAUNDRY_SHORTFALL; + vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL; VM_CNT_INC(v_pdshortfalls); - } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) - vm_laundry_request = VM_LAUNDRY_BACKGROUND; - wakeup(&vm_laundry_request); + } else if (vmd->vmd_laundry_request != + VM_LAUNDRY_SHORTFALL) + vmd->vmd_laundry_request = + VM_LAUNDRY_BACKGROUND; + wakeup(&vmd->vmd_laundry_request); } - vm_inactq_scans++; + vmd->vmd_inactq_scans++; vm_pagequeue_unlock(pq); } @@ -1397,9 +1381,9 @@ * more aggressively, improving the effectiveness of clustering and * ensuring that they can eventually be reused. 
*/ - inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + - vm_cnt.v_laundry_count / act_scan_laundry_weight) + - vm_paging_target() + deficit + addl_page_shortage; + inactq_shortage = vmd->vmd_inactive_target - (pq->pq_cnt + + vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight) + + vm_paging_target(vmd) + deficit + addl_page_shortage; inactq_shortage *= act_scan_laundry_weight; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; @@ -1742,6 +1726,8 @@ } sx_sunlock(&allproc_lock); if (bigproc != NULL) { + int i; + if (vm_panic_on_oom != 0) panic("out of swap space"); PROC_LOCK(bigproc); @@ -1749,19 +1735,20 @@ sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); - wakeup(&vm_cnt.v_free_count); + for (i = 0; i < vm_ndomains; i++) + wakeup(&VM_DOMAIN(i)->vmd_free_count); } } static void vm_pageout_worker(void *arg) { - struct vm_domain *domain; - int domidx, pass; + struct vm_domain *vmd; + int domain, pass; bool target_met; - domidx = (uintptr_t)arg; - domain = &vm_dom[domidx]; + domain = (uintptr_t)arg; + vmd = VM_DOMAIN(domain); pass = 0; target_met = true; @@ -1771,18 +1758,18 @@ * is allocated. */ - KASSERT(domain->vmd_segs != 0, ("domain without segments")); - domain->vmd_last_active_scan = ticks; - vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); - vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); - TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, - &domain->vmd_inacthead, plinks.q); + KASSERT(vmd->vmd_segs != 0, ("domain without segments")); + vmd->vmd_last_active_scan = ticks; + vm_pageout_init_marker(&vmd->vmd_marker, PQ_INACTIVE); + vm_pageout_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE); + TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, + &vmd->vmd_inacthead, plinks.q); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { - mtx_lock(&vm_page_queue_free_mtx); + vm_domain_free_lock(vmd); /* * Generally, after a level >= 1 scan, if there are enough @@ -1796,34 +1783,34 @@ * thread will, nonetheless, wait until another page is freed * or this wakeup is performed. */ - if (vm_pages_needed && !vm_page_count_min()) { - vm_pages_needed = false; - wakeup(&vm_cnt.v_free_count); + if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) { + vmd->vmd_pages_needed = false; + wakeup(&vmd->vmd_free_count); } /* - * Do not clear vm_pageout_wanted until we reach our free page + * Do not clear vmd_pageout_wanted until we reach our free page * target. Otherwise, we may be awakened over and over again, * wasting CPU time. */ - if (vm_pageout_wanted && target_met) - vm_pageout_wanted = false; + if (vmd->vmd_pageout_wanted && target_met) + vmd->vmd_pageout_wanted = false; /* * Might the page daemon receive a wakeup call? */ - if (vm_pageout_wanted) { + if (vmd->vmd_pageout_wanted) { /* - * No. Either vm_pageout_wanted was set by another + * No. Either vmd_pageout_wanted was set by another * thread during the previous scan, which must have - * been a level 0 scan, or vm_pageout_wanted was + * been a level 0 scan, or vmd_pageout_wanted was * already set and the scan failed to free enough * pages. If we haven't yet performed a level >= 1 * (page reclamation) scan, then increase the level * and scan again now. Otherwise, sleep a bit and * try again later. */ - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_free_unlock(vmd); if (pass >= 1) pause("pwait", hz / VM_INACT_SCAN_RATE); pass++; @@ -1834,20 +1821,20 @@ * sleep until the next wakeup or until pages need to * have their reference stats updated. 
*/ - if (vm_pages_needed) { - mtx_unlock(&vm_page_queue_free_mtx); + if (vmd->vmd_pages_needed) { + vm_domain_free_unlock(vmd); if (pass == 0) pass++; - } else if (mtx_sleep(&vm_pageout_wanted, - &vm_page_queue_free_mtx, PDROP | PVM, "psleep", - hz) == 0) { + } else if (mtx_sleep(&vmd->vmd_pageout_wanted, + vm_domain_free_lockptr(vmd), PDROP | PVM, + "psleep", hz) == 0) { VM_CNT_INC(v_pdwakeups); pass = 1; } else pass = 0; } - target_met = vm_pageout_scan(domain, pass); + target_met = vm_pageout_scan(vmd, pass); } } @@ -1855,43 +1842,78 @@ * vm_pageout_init initialises basic pageout daemon settings. */ static void -vm_pageout_init(void) +vm_pageout_init_domain(int domain) { - /* - * Initialize some paging parameters. - */ - vm_cnt.v_interrupt_free_min = 2; - if (vm_cnt.v_page_count < 2000) - vm_pageout_page_count = 8; + struct vm_domain *vmd; + vmd = VM_DOMAIN(domain); + vmd->vmd_interrupt_free_min = 2; + /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ - if (vm_cnt.v_page_count > 1024) - vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; + if (vmd->vmd_page_count > 1024) + vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200; else - vm_cnt.v_free_min = 4; - vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + - vm_cnt.v_interrupt_free_min; - vm_cnt.v_free_reserved = vm_pageout_page_count + - vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); - vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; - vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; - vm_cnt.v_free_min += vm_cnt.v_free_reserved; - vm_cnt.v_free_severe += vm_cnt.v_free_reserved; - vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; - if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) - vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; + vmd->vmd_free_min = 4; + vmd->vmd_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + + vmd->vmd_interrupt_free_min; + vmd->vmd_free_reserved = vm_pageout_page_count + + vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768); + vmd->vmd_free_severe = vmd->vmd_free_min / 2; + vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved; + vmd->vmd_free_min += vmd->vmd_free_reserved; + vmd->vmd_free_severe += vmd->vmd_free_reserved; + vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2; + if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3) + vmd->vmd_inactive_target = vmd->vmd_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. */ - vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; + vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_min / 10) * 11; /* + * Target amount of memory to move out of the laundry queue during a + * background laundering. This is proportional to the amount of system + * memory. + */ + vmd->vmd_background_launder_target = (vmd->vmd_free_target - + vmd->vmd_free_min) / 10; +} + +static void +vm_pageout_init(void) +{ + u_int freecount; + int i; + + /* + * Initialize some paging parameters. 
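The per-domain thresholds computed by vm_pageout_init_domain() above follow a fixed chain of formulas, and vm_pageout_init() just below sums them back into the global vm_cnt fields. The sketch reproduces that chain for one hypothetical domain; the 4 KB page size, the 64 KB MAXBSIZE, the vm_pageout_page_count default of 32, and the one-million-page domain are all assumptions made for illustration.

#include <stdio.h>

#define PAGE_SIZE	4096		/* assumption */
#define MAXBSIZE	65536		/* assumption */

int
main(void)
{
	unsigned page_count = 1048576;		/* hypothetical 4 GB domain */
	unsigned free_count = page_count;	/* assume all pages start free */
	unsigned vm_pageout_page_count = 32;	/* assumed default */
	unsigned interrupt_free_min, free_min, pageout_free_min, free_reserved;
	unsigned free_severe, free_target, inactive_target, wakeup_thresh;
	unsigned background_launder_target;

	interrupt_free_min = 2;
	free_min = page_count > 1024 ? 4 + (page_count - 1024) / 200 : 4;
	pageout_free_min = (2 * MAXBSIZE) / PAGE_SIZE + interrupt_free_min;
	free_reserved = vm_pageout_page_count + pageout_free_min +
	    page_count / 768;
	free_severe = free_min / 2;
	free_target = 4 * free_min + free_reserved;
	free_min += free_reserved;
	free_severe += free_reserved;
	inactive_target = (3 * free_target) / 2;
	if (inactive_target > free_count / 3)
		inactive_target = free_count / 3;
	wakeup_thresh = (free_min / 10) * 11;
	background_launder_target = (free_target - free_min) / 10;

	printf("free_min %u free_target %u free_severe %u\n",
	    free_min, free_target, free_severe);
	printf("inactive_target %u wakeup_thresh %u launder_target %u\n",
	    inactive_target, wakeup_thresh, background_launder_target);
	return (0);
}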
+ */ + if (vm_cnt.v_page_count < 2000) + vm_pageout_page_count = 8; + + freecount = 0; + for (i = 0; i < vm_ndomains; i++) { + struct vm_domain *vmd; + + vm_pageout_init_domain(i); + vmd = VM_DOMAIN(i); + vm_cnt.v_free_reserved += vmd->vmd_free_reserved; + vm_cnt.v_free_target += vmd->vmd_free_target; + vm_cnt.v_free_min += vmd->vmd_free_min; + vm_cnt.v_inactive_target += vmd->vmd_inactive_target; + vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min; + vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min; + vm_cnt.v_free_severe += vmd->vmd_free_severe; + freecount += vmd->vmd_free_count; + } + + /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. @@ -1899,17 +1921,8 @@ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; - /* XXX does not really belong here */ if (vm_page_max_wired == 0) - vm_page_max_wired = vm_cnt.v_free_count / 3; - - /* - * Target amount of memory to move out of the laundry queue during a - * background laundering. This is proportional to the amount of system - * memory. - */ - vm_background_launder_target = (vm_cnt.v_free_target - - vm_cnt.v_free_min) / 10; + vm_page_max_wired = freecount / 3; } /* @@ -1933,6 +1946,12 @@ panic("starting pageout for domain %d, error %d\n", i, error); } + error = kthread_add(vm_pageout_laundry_worker, + (void *)(uintptr_t)i, curproc, NULL, 0, 0, + "laundry: dom%d", i); + if (error != 0) + panic("starting laundry for domain %d, error %d", + i, error); } error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 0, 0, "uma"); @@ -1945,14 +1964,16 @@ * Perform an advisory wakeup of the page daemon. */ void -pagedaemon_wakeup(void) +pagedaemon_wakeup(int domain) { + struct vm_domain *vmd; - mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED); + vmd = VM_DOMAIN(domain); + vm_domain_free_assert_unlocked(vmd); - if (!vm_pageout_wanted && curthread->td_proc != pageproc) { - vm_pageout_wanted = true; - wakeup(&vm_pageout_wanted); + if (!vmd->vmd_pageout_wanted && curthread->td_proc != pageproc) { + vmd->vmd_pageout_wanted = true; + wakeup(&vmd->vmd_pageout_wanted); } } @@ -1962,22 +1983,26 @@ * This function returns with the free queues mutex unlocked. */ void -pagedaemon_wait(int pri, const char *wmesg) +pagedaemon_wait(int domain, int pri, const char *wmesg) { + struct vm_domain *vmd; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vmd = VM_DOMAIN(domain); + vm_domain_free_assert_locked(vmd); /* - * vm_pageout_wanted may have been set by an advisory wakeup, but if the - * page daemon is running on a CPU, the wakeup will have been lost. + * vmd_pageout_wanted may have been set by an advisory wakeup, but if + * the page daemon is running on a CPU, the wakeup will have been lost. * Thus, deliver a potentially spurious wakeup to ensure that the page * daemon has been notified of the shortage. 
*/ - if (!vm_pageout_wanted || !vm_pages_needed) { - vm_pageout_wanted = true; - wakeup(&vm_pageout_wanted); + if (!vmd->vmd_pageout_wanted || !vmd->vmd_pages_needed) { + vmd->vmd_pageout_wanted = true; + wakeup(&vmd->vmd_pageout_wanted); } - vm_pages_needed = true; - msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri, + vmd->vmd_pages_needed = true; + vmd->vmd_waiters++; + msleep(&vmd->vmd_free_count, vm_domain_free_lockptr(vmd), PDROP | pri, wmesg, 0); + vmd->vmd_waiters--; } Index: sys/vm/vm_pagequeue.h =================================================================== --- sys/vm/vm_pagequeue.h +++ sys/vm/vm_pagequeue.h @@ -0,0 +1,235 @@ +/*- + * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + * + * $FreeBSD$ + */ + +#ifndef _VM_PAGEQUEUE_ +#define _VM_PAGEQUEUE_ + +#ifdef _KERNEL +struct vm_pagequeue { + struct mtx pq_mutex; + struct pglist pq_pl; + int pq_cnt; + const char * const pq_name; +} __aligned(CACHE_LINE_SIZE); + + +struct vm_domain { + struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; + struct mtx_padalign vmd_free_mtx; + struct vmem *vmd_kernel_arena; + u_int vmd_domain; /* Domain number. */ + u_int vmd_page_count; + long vmd_segs; /* bitmask of the segments */ + + /* Paging control variables, locked by domain_free_mtx. */ + u_int vmd_free_count; + boolean_t vmd_oom; + int vmd_oom_seq; + int vmd_last_active_scan; + struct vm_page vmd_laundry_marker; + struct vm_page vmd_marker; /* marker for pagedaemon private use */ + struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ + + int vmd_pageout_pages_needed; /* page daemon waiting for pages? */ + int vmd_pageout_deficit; /* Estimated number of pages deficit */ + int vmd_waiters; /* Pageout waiters. */ + bool vmd_pages_needed; /* Are threads waiting for free pages? */ + bool vmd_pageout_wanted; /* pageout daemon wait channel */ + bool vmd_minset; /* Are we in vm_min_domains? */ + bool vmd_severeset; /* Are we in vm_severe_domains? */ + int vmd_inactq_scans; + enum { + VM_LAUNDRY_IDLE = 0, + VM_LAUNDRY_BACKGROUND, + VM_LAUNDRY_SHORTFALL + } vmd_laundry_request; + + /* Paging thresholds. */ + u_int vmd_background_launder_target; + u_int vmd_free_reserved; /* (c) pages reserved for deadlock */ + u_int vmd_free_target; /* (c) pages desired free */ + u_int vmd_free_min; /* (c) pages desired free */ + u_int vmd_inactive_target; /* (c) pages desired inactive */ + u_int vmd_pageout_free_min; /* (c) min pages reserved for kernel */ + u_int vmd_pageout_wakeup_thresh;/* (c) min pages to wake pagedaemon */ + u_int vmd_interrupt_free_min; /* (c) reserved pages for int code */ + u_int vmd_free_severe; /* (c) severe page depletion point */ +} __aligned(CACHE_LINE_SIZE); + +extern struct vm_domain vm_dom[MAXMEMDOM]; + +#define VM_DOMAIN(n) (&vm_dom[(n)]) + +#define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) +#define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) +#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) +#define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) + +#define vm_domain_free_assert_locked(n) \ + mtx_assert(vm_domain_free_lockptr((n)), MA_OWNED) +#define vm_domain_free_assert_unlocked(n) \ + mtx_assert(vm_domain_free_lockptr((n)), MA_NOTOWNED) +#define vm_domain_free_lock(d) \ + mtx_lock(vm_domain_free_lockptr((d))) +#define vm_domain_free_lockptr(d) \ + (&(d)->vmd_free_mtx) +#define vm_domain_free_unlock(d) \ + mtx_unlock(vm_domain_free_lockptr((d))) + +static __inline void +vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) +{ + +#ifdef notyet + vm_pagequeue_assert_locked(pq); +#endif + pq->pq_cnt += addend; +} +#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) +#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) + +void vm_domain_set(struct vm_domain *vmd); +int vm_domain_available(struct vm_domain *vmd, int req, int npages); + +/* + * vm_pagequeue_domain: + * + * Return the memory 
domain the page belongs to. + */ +static inline struct vm_domain * +vm_pagequeue_domain(vm_page_t m) +{ + + return (VM_DOMAIN(vm_phys_domain(m))); +} + +/* + * Return the number of pages we need to free-up or cache + * A positive number indicates that we do not have enough free pages. + */ +static inline int +vm_paging_target(struct vm_domain *vmd) +{ + + return (vmd->vmd_free_target - vmd->vmd_free_count); +} + +/* + * Returns TRUE if the pagedaemon needs to be woken up. + */ +static inline int +vm_paging_needed(struct vm_domain *vmd, u_int free_count) +{ + + return (free_count < vmd->vmd_pageout_wakeup_thresh); +} + +/* + * Returns TRUE if the domain is below the min paging target. + */ +static inline int +vm_paging_min(struct vm_domain *vmd) +{ + + return (vmd->vmd_free_min > vmd->vmd_free_count); +} + +/* + * Returns TRUE if the domain is below the severe paging target. + */ +static inline int +vm_paging_severe(struct vm_domain *vmd) +{ + + return (vmd->vmd_free_severe > vmd->vmd_free_count); +} + +/* + * Return the number of pages we need to launder. + * A positive number indicates that we have a shortfall of clean pages. + */ +static inline int +vm_laundry_target(struct vm_domain *vmd) +{ + + return (vm_paging_target(vmd)); +} + +static inline u_int +vm_domain_freecnt_adj(struct vm_domain *vmd, int adj) +{ + u_int ret; + + vm_domain_free_assert_locked(vmd); + ret = vmd->vmd_free_count += adj; + if ((!vmd->vmd_minset && vm_paging_min(vmd)) || + (!vmd->vmd_severeset && vm_paging_severe(vmd))) + vm_domain_set(vmd); + + return (ret); +} + + +#endif /* _KERNEL */ +#endif /* !_VM_PAGEQUEUE_ */ Index: sys/vm/vm_phys.h =================================================================== --- sys/vm/vm_phys.h +++ sys/vm/vm_phys.h @@ -96,12 +96,12 @@ /* * - * vm_phys_domidx: + * vm_phys_domain: * * Return the index of the domain the page belongs to. */ static inline int -vm_phys_domidx(vm_page_t m) +vm_phys_domain(vm_page_t m) { #ifdef NUMA int domn, segind; @@ -115,27 +115,6 @@ #else return (0); #endif -} - -/* - * vm_phys_domain: - * - * Return the memory domain the page belongs to. - */ -static inline struct vm_domain * -vm_phys_domain(vm_page_t m) -{ - - return (&vm_dom[vm_phys_domidx(m)]); -} - -static inline u_int -vm_phys_freecnt_adj(vm_page_t m, int adj) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - vm_phys_domain(m)->vmd_free_count += adj; - return (vm_cnt.v_free_count += adj); } #endif /* _KERNEL */ Index: sys/vm/vm_phys.c =================================================================== --- sys/vm/vm_phys.c +++ sys/vm/vm_phys.c @@ -67,6 +67,7 @@ #include #include #include +#include _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); @@ -653,7 +654,7 @@ if (flind < 0) return (NULL); - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(domain)); fl = &vm_phys_free_queues[domain][flind][pool][0]; for (oind = order; oind < VM_NFREEORDER; oind++) { m = TAILQ_FIRST(&fl[oind].pl); @@ -906,8 +907,8 @@ m, m->pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_free_pages: order %d is out of range", order)); - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); seg = &vm_phys_segs[m->segind]; + vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); if (order < VM_NFREEORDER - 1) { pa = VM_PAGE_TO_PHYS(m); do { @@ -945,7 +946,7 @@ * Avoid unnecessary coalescing by freeing the pages in the largest * possible power-of-two-sized subsets. 
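The inline helpers above are meant to be used with the free count adjusted under vmd_free_mtx and any page-daemon wakeup issued after the lock is dropped, which is the pattern vm_reserv_extend() follows later in this patch. A userland model of that pattern, with a pthread mutex standing in for the domain free lock; every name in it is invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct model_domain {
	pthread_mutex_t	free_mtx;		/* stands in for vmd_free_mtx */
	unsigned	free_count;		/* vmd_free_count */
	unsigned	pageout_wakeup_thresh;	/* vmd_pageout_wakeup_thresh */
};

static bool
model_paging_needed(struct model_domain *vmd, unsigned free_count)
{

	return (free_count < vmd->pageout_wakeup_thresh);
}

static unsigned
model_freecnt_adj(struct model_domain *vmd, int adj)
{

	/* The kernel asserts the free lock here; our caller holds free_mtx. */
	vmd->free_count += adj;
	return (vmd->free_count);
}

int
main(void)
{
	struct model_domain dom;
	unsigned free_count;

	pthread_mutex_init(&dom.free_mtx, NULL);
	dom.free_count = 1000;
	dom.pageout_wakeup_thresh = 995;

	pthread_mutex_lock(&dom.free_mtx);
	free_count = model_freecnt_adj(&dom, -8);	/* "allocate" 8 pages */
	pthread_mutex_unlock(&dom.free_mtx);
	if (model_paging_needed(&dom, free_count))
		printf("would call pagedaemon_wakeup(domain)\n");
	return (0);
}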
*/ - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(vm_pagequeue_domain(m)); for (;; npages -= n) { /* * Unsigned "min" is used here so that "order" is assigned @@ -1051,14 +1052,13 @@ vm_page_t m_set, m_tmp; int order; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - /* * First, find the contiguous, power of two-sized set of free * physical pages containing the given physical page "m" and * assign it to "m_set". */ seg = &vm_phys_segs[m->segind]; + vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; @@ -1122,7 +1122,7 @@ KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(domain)); if (low >= high) return (NULL); m_run = NULL; @@ -1167,7 +1167,7 @@ KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(seg->domain)); /* Compute the queue that is the best fit for npages. */ for (order = 0; (1 << order) < npages; order++); /* Search for a run satisfying the specified conditions. */ Index: sys/vm/vm_reserv.h =================================================================== --- sys/vm/vm_reserv.h +++ sys/vm/vm_reserv.h @@ -50,8 +50,14 @@ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_page_t mpred); +vm_page_t vm_reserv_extend_contig(int req, vm_object_t object, + vm_pindex_t pindex, int domain, u_long npages, + vm_paddr_t low, vm_paddr_t high, u_long alignment, + vm_paddr_t boundary, vm_page_t mpred); vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain, vm_page_t mpred); +vm_page_t vm_reserv_extend(int req, vm_object_t object, + vm_pindex_t pindex, int domain, vm_page_t mpred); void vm_reserv_break_all(vm_object_t object); boolean_t vm_reserv_free_page(vm_page_t m); void vm_reserv_init(void); Index: sys/vm/vm_reserv.c =================================================================== --- sys/vm/vm_reserv.c +++ sys/vm/vm_reserv.c @@ -59,7 +59,9 @@ #include #include #include +#include #include +#include #include #include @@ -163,17 +165,21 @@ * object's list of reservations. * * A partially populated reservation can be broken and reclaimed at any time. + * + * f - vm_domain_free_lock + * o - vm_reserv_object_lock + * c - constant after boot */ struct vm_reserv { - TAILQ_ENTRY(vm_reserv) partpopq; - LIST_ENTRY(vm_reserv) objq; - vm_object_t object; /* containing object */ - vm_pindex_t pindex; /* offset within object */ - vm_page_t pages; /* first page of a superpage */ - int domain; /* NUMA domain */ - int popcnt; /* # of pages in use */ - char inpartpopq; - popmap_t popmap[NPOPMAP]; /* bit vector of used pages */ + TAILQ_ENTRY(vm_reserv) partpopq; /* (f) per-domain queue. */ + LIST_ENTRY(vm_reserv) objq; /* (o, f) object queue */ + vm_object_t object; /* (o, f) containing object */ + vm_pindex_t pindex; /* (o, f) offset in object */ + vm_page_t pages; /* (c) first page */ + int domain; /* (c) NUMA domain. 
*/ + int popcnt; /* (f) # of pages in use */ + char inpartpopq; /* (f) */ + popmap_t popmap[NPOPMAP]; /* (f) bit vector, used pages */ }; /* @@ -234,6 +240,25 @@ SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD, &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations"); +/* + * The object lock pool is used to synchronize the rvq. We can not use a + * pool mutex because it is required before malloc works. + * + * The "hash" function could be made faster without divide and modulo. + */ +#define VM_RESERV_OBJ_LOCK_COUNT MAXCPU + +struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT]; + +#define vm_reserv_object_lock_idx(object) \ + (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT) +#define vm_reserv_object_lock_ptr(object) \ + &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))] +#define vm_reserv_object_lock(object) \ + mtx_lock(vm_reserv_object_lock_ptr((object))) +#define vm_reserv_object_unlock(object) \ + mtx_unlock(vm_reserv_object_lock_ptr((object))) + static void vm_reserv_break(vm_reserv_t rv, vm_page_t m); static void vm_reserv_depopulate(vm_reserv_t rv, int index); static vm_reserv_t vm_reserv_from_page(vm_page_t m); @@ -288,12 +313,12 @@ for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { counter = 0; unused_pages = 0; - mtx_lock(&vm_page_queue_free_mtx); + vm_domain_free_lock(VM_DOMAIN(domain)); TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) { counter++; unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; } - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_free_unlock(VM_DOMAIN(domain)); sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n", domain, level, unused_pages * ((int)PAGE_SIZE / 1024), counter); @@ -305,6 +330,49 @@ } /* + * Remove a reservation from the object's objq. + */ +static void +vm_reserv_remove(vm_reserv_t rv) +{ + vm_object_t object; + + KASSERT(rv->object != NULL, + ("vm_reserv_remove: reserv %p is free", rv)); + KASSERT(!rv->inpartpopq, + ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv)); + object = rv->object; + vm_reserv_object_lock(object); + LIST_REMOVE(rv, objq); + rv->object = NULL; + vm_reserv_object_unlock(object); +} + +/* + * Insert a new reservation into the object's objq. + */ +static void +vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex) +{ + int i; + + KASSERT(rv->object == NULL, + ("vm_reserv_insert: reserv %p isn't free", rv)); + KASSERT(rv->popcnt == 0, + ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv)); + KASSERT(!rv->inpartpopq, + ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv)); + for (i = 0; i < NPOPMAP; i++) + KASSERT(rv->popmap[i] == 0, + ("vm_reserv_insert: reserv %p's popmap is corrupted", rv)); + vm_reserv_object_lock(object); + rv->pindex = pindex; + rv->object = object; + LIST_INSERT_HEAD(&object->rvq, rv, objq); + vm_reserv_object_unlock(object); +} + +/* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. 
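The comment above notes that the object-lock hash could avoid the divide and modulo. One possible shape for that, assuming the lock table were sized up to a power of two; the table size and shift amount below are guesses for illustration, not values from the patch.

#include <stdint.h>
#include <stdio.h>

#define OBJ_LOCK_COUNT	64	/* assumption: power of two >= MAXCPU */
#define OBJ_LOCK_SHIFT	9	/* assumption: roughly log2(sizeof(struct vm_object)) */

static inline unsigned
obj_lock_idx(const void *object)
{

	return (((uintptr_t)object >> OBJ_LOCK_SHIFT) & (OBJ_LOCK_COUNT - 1));
}

int
main(void)
{
	int dummy;

	printf("lock index for %p: %u\n", (void *)&dummy, obj_lock_idx(&dummy));
	return (0);
}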
Additionally, moves the * reservation to the tail of the partially populated reservation queue if the @@ -316,7 +384,7 @@ vm_reserv_depopulate(vm_reserv_t rv, int index) { - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); KASSERT(popmap_is_set(rv->popmap, index), @@ -339,9 +407,7 @@ popmap_clear(rv->popmap, index); rv->popcnt--; if (rv->popcnt == 0) { - LIST_REMOVE(rv, objq); - rv->object = NULL; - rv->domain = -1; + vm_reserv_remove(rv); vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER); vm_reserv_freed++; } else { @@ -361,6 +427,43 @@ } /* + * Returns an existing reservation or NULL and initialized successor pointer. + */ +static vm_reserv_t +vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex, + vm_page_t mpred, vm_page_t *msuccp) +{ + vm_reserv_t rv; + vm_page_t msucc; + + msucc = NULL; + if (mpred != NULL) { + KASSERT(mpred->object == object, + ("vm_reserv_from_object: object doesn't contain mpred")); + KASSERT(mpred->pindex < pindex, + ("vm_reserv_from_object: mpred doesn't precede pindex")); + rv = vm_reserv_from_page(mpred); + if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) + goto found; + msucc = TAILQ_NEXT(mpred, listq); + } else + msucc = TAILQ_FIRST(&object->memq); + if (msucc != NULL) { + KASSERT(msucc->pindex > pindex, + ("vm_reserv_from_object: msucc doesn't succeed pindex")); + rv = vm_reserv_from_page(msucc); + if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) + goto found; + } + rv = NULL; + +found: + *msuccp = msucc; + + return (rv); +} + +/* * Returns TRUE if the given reservation contains the given page index and * FALSE otherwise. */ @@ -381,7 +484,7 @@ vm_reserv_populate(vm_reserv_t rv, int index) { - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); KASSERT(rv->object != NULL, ("vm_reserv_populate: reserv %p is free", rv)); KASSERT(popmap_is_clear(rv->popmap, index), @@ -423,6 +526,100 @@ * The object and free page queue must be locked. */ vm_page_t +vm_reserv_extend_contig(int req, vm_object_t object, vm_pindex_t pindex, + int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, + u_long alignment, vm_paddr_t boundary, vm_page_t mpred) +{ + struct vm_domain *vmd; + vm_paddr_t pa, size; + vm_page_t m, msucc; + vm_reserv_t rv; + int i, index; + + VM_OBJECT_ASSERT_WLOCKED(object); + KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); + + /* + * Is a reservation fundamentally impossible? + */ + if (pindex < VM_RESERV_INDEX(object, pindex) || + pindex + npages > object->size || object->resident_page_count == 0) + return (NULL); + + /* + * All reservations of a particular size have the same alignment. + * Assuming that the first page is allocated from a reservation, the + * least significant bits of its physical address can be determined + * from its offset from the beginning of the reservation and the size + * of the reservation. + * + * Could the specified index within a reservation of the smallest + * possible size satisfy the alignment and boundary requirements? + */ + pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT; + if ((pa & (alignment - 1)) != 0) + return (NULL); + size = npages << PAGE_SHIFT; + if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) + return (NULL); + + /* + * Look for an existing reservation. 
+ */ + rv = vm_reserv_from_object(object, pindex, mpred, &msucc); + if (rv == NULL) + return (NULL); + KASSERT(object != kernel_object || rv->domain == domain, + ("vm_reserv_extend_contig: Domain mismatch from reservation.")); + index = VM_RESERV_INDEX(object, pindex); + /* Does the allocation fit within the reservation? */ + if (index + npages > VM_LEVEL_0_NPAGES) + return (NULL); + domain = rv->domain; + vmd = VM_DOMAIN(domain); + vm_domain_free_lock(vmd); + if (rv->object != object || !vm_domain_available(vmd, req, npages)) { + m = NULL; + goto out; + } + m = &rv->pages[index]; + pa = VM_PAGE_TO_PHYS(m); + if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 || + ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { + m = NULL; + goto out; + } + /* Handle vm_page_rename(m, new_object, ...). */ + for (i = 0; i < npages; i++) { + if (popmap_is_set(rv->popmap, index + i)) { + m = NULL; + goto out; + } + } + for (i = 0; i < npages; i++) + vm_reserv_populate(rv, index + i); + vm_domain_freecnt_adj(vmd, -npages); +out: + vm_domain_free_unlock(vmd); + return (m); +} + +/* + * Allocates a contiguous set of physical pages of the given size "npages" + * from existing or newly created reservations. All of the physical pages + * must be at or above the given physical address "low" and below the given + * physical address "high". The given value "alignment" determines the + * alignment of the first physical page in the set. If the given value + * "boundary" is non-zero, then the set of physical pages cannot cross any + * physical address boundary that is a multiple of that value. Both + * "alignment" and "boundary" must be a power of two. + * + * The page "mpred" must immediately precede the offset "pindex" within the + * specified object. + * + * The object and free page queue must be locked. + */ +vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_page_t mpred) @@ -434,7 +631,7 @@ u_long allocpages, maxpages, minpages; int i, index, n; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(domain)); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); @@ -463,52 +660,48 @@ return (NULL); /* - * Look for an existing reservation. + * Callers should've extended an existing reservation prior to + * calling this function. If a reservation exists it is + * incompatible with the allocation. */ - if (mpred != NULL) { - KASSERT(mpred->object == object, - ("vm_reserv_alloc_contig: object doesn't contain mpred")); - KASSERT(mpred->pindex < pindex, - ("vm_reserv_alloc_contig: mpred doesn't precede pindex")); - rv = vm_reserv_from_page(mpred); - if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) - goto found; - msucc = TAILQ_NEXT(mpred, listq); - } else - msucc = TAILQ_FIRST(&object->memq); - if (msucc != NULL) { - KASSERT(msucc->pindex > pindex, - ("vm_reserv_alloc_contig: msucc doesn't succeed pindex")); - rv = vm_reserv_from_page(msucc); - if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) - goto found; - } + rv = vm_reserv_from_object(object, pindex, mpred, &msucc); + if (rv != NULL) + return (NULL); /* * Could at least one reservation fit between the first index to the * left that can be used ("leftcap") and the first index to the right * that cannot be used ("rightcap")? 
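The alignment and boundary screening in vm_reserv_extend_contig() above reduces to two bit tests on the candidate physical address. A small stand-alone version of those tests, with made-up addresses and sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t vm_paddr_t;	/* local stand-in */

static bool
run_is_ok(vm_paddr_t pa, vm_paddr_t size, vm_paddr_t alignment,
    vm_paddr_t boundary)
{

	if ((pa & (alignment - 1)) != 0)
		return (false);		/* start is not aligned */
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (false);		/* first and last byte straddle a boundary */
	return (true);
}

int
main(void)
{

	/* 64 KB run at 0x1f0000, 64 KB aligned, stays below the 2 MB line. */
	printf("fits: %d\n", run_is_ok(0x1f0000, 0x10000, 0x10000, 0x200000));
	/* 64 KB run at 0x1f8000 crosses the 2 MB boundary at 0x200000. */
	printf("fits: %d\n", run_is_ok(0x1f8000, 0x10000, 0x8000, 0x200000));
	return (0);
}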
+ * + * We must synchronize with the reserv object lock to protect the + * pindex/object of the resulting reservations against rename while + * we are inspecting. */ first = pindex - VM_RESERV_INDEX(object, pindex); + minpages = VM_RESERV_INDEX(object, pindex) + npages; + maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES); + allocpages = maxpages; + vm_reserv_object_lock(object); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; - if (leftcap > first) + if (leftcap > first) { + vm_reserv_object_unlock(object); return (NULL); + } } - minpages = VM_RESERV_INDEX(object, pindex) + npages; - maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES); - allocpages = maxpages; if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + maxpages > rightcap) { - if (maxpages == VM_LEVEL_0_NPAGES) + if (maxpages == VM_LEVEL_0_NPAGES) { + vm_reserv_object_unlock(object); return (NULL); + } /* * At least one reservation will fit between "leftcap" @@ -519,6 +712,7 @@ allocpages = minpages; } } + vm_reserv_object_unlock(object); /* * Would the last new reservation extend past the end of the object? @@ -549,7 +743,7 @@ VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0); if (m == NULL) return (NULL); - KASSERT(vm_phys_domidx(m) == domain, + KASSERT(vm_phys_domain(m) == domain, ("vm_reserv_alloc_contig: Page domain does not match requested.")); /* @@ -565,22 +759,7 @@ KASSERT(rv->pages == m, ("vm_reserv_alloc_contig: reserv %p's pages is corrupted", rv)); - KASSERT(rv->object == NULL, - ("vm_reserv_alloc_contig: reserv %p isn't free", rv)); - LIST_INSERT_HEAD(&object->rvq, rv, objq); - rv->object = object; - rv->pindex = first; - rv->domain = domain; - KASSERT(rv->popcnt == 0, - ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted", - rv)); - KASSERT(!rv->inpartpopq, - ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE", - rv)); - for (i = 0; i < NPOPMAP; i++) - KASSERT(rv->popmap[i] == 0, - ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted", - rv)); + vm_reserv_insert(rv, object, first); n = ulmin(VM_LEVEL_0_NPAGES - index, npages); for (i = 0; i < n; i++) vm_reserv_populate(rv, index + i); @@ -594,31 +773,68 @@ allocpages -= VM_LEVEL_0_NPAGES; } while (allocpages >= VM_LEVEL_0_NPAGES); return (m_ret); +} +/* + * Attempts to extend an existing reservation and allocate the page to the + * object. + * + * The page "mpred" must immediately precede the offset "pindex" within the + * specified object. + * + * The object must be locked. + */ +vm_page_t +vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain, + vm_page_t mpred) +{ + struct vm_domain *vmd; + vm_page_t m, msucc; + vm_reserv_t rv; + int index, free_count; + + VM_OBJECT_ASSERT_WLOCKED(object); + /* - * Found a matching reservation. + * Could a reservation currently exist? */ -found: - index = VM_RESERV_INDEX(object, pindex); - /* Does the allocation fit within the reservation? */ - if (index + npages > VM_LEVEL_0_NPAGES) + if (pindex < VM_RESERV_INDEX(object, pindex) || + pindex >= object->size || object->resident_page_count == 0) return (NULL); - m = &rv->pages[index]; - pa = VM_PAGE_TO_PHYS(m); - if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 || - ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) + + /* + * Look for an existing reservation. 
+ */ + rv = vm_reserv_from_object(object, pindex, mpred, &msucc); + if (rv == NULL) return (NULL); - /* Handle vm_page_rename(m, new_object, ...). */ - for (i = 0; i < npages; i++) - if (popmap_is_set(rv->popmap, index + i)) - return (NULL); - for (i = 0; i < npages; i++) - vm_reserv_populate(rv, index + i); + + KASSERT(object != kernel_object || rv->domain == domain, + ("vm_reserv_extend: Domain mismatch from reservation.")); + domain = rv->domain; + vmd = VM_DOMAIN(domain); + index = VM_RESERV_INDEX(object, pindex); + m = &rv->pages[index]; + vm_domain_free_lock(vmd); + if (vm_domain_available(vmd, req, 1) == 0 || + /* Handle reclaim race. */ + rv->object != object || + /* Handle vm_page_rename(m, new_object, ...). */ + popmap_is_set(rv->popmap, index)) + m = NULL; + if (m != NULL) + vm_reserv_populate(rv, index); + free_count = vm_domain_freecnt_adj(vmd, -1); + vm_domain_free_unlock(vmd); + + if (vm_paging_needed(vmd, free_count)) + pagedaemon_wakeup(domain); + return (m); } /* - * Allocates a page from an existing or newly created reservation. + * Allocates a page from an existing reservation. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. @@ -632,9 +848,9 @@ vm_page_t m, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; - int i, index; + int index; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(domain)); VM_OBJECT_ASSERT_WLOCKED(object); /* @@ -645,48 +861,45 @@ return (NULL); /* - * Look for an existing reservation. + * Callers should've extended an existing reservation prior to + * calling this function. If a reservation exists it is + * incompatible with the allocation. */ - if (mpred != NULL) { - KASSERT(mpred->object == object, - ("vm_reserv_alloc_page: object doesn't contain mpred")); - KASSERT(mpred->pindex < pindex, - ("vm_reserv_alloc_page: mpred doesn't precede pindex")); - rv = vm_reserv_from_page(mpred); - if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) - goto found; - msucc = TAILQ_NEXT(mpred, listq); - } else - msucc = TAILQ_FIRST(&object->memq); - if (msucc != NULL) { - KASSERT(msucc->pindex > pindex, - ("vm_reserv_alloc_page: msucc doesn't succeed pindex")); - rv = vm_reserv_from_page(msucc); - if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) - goto found; - } + rv = vm_reserv_from_object(object, pindex, mpred, &msucc); + if (rv != NULL) + return (NULL); /* * Could a reservation fit between the first index to the left that * can be used and the first index to the right that cannot be used? + * + * We must synchronize with the reserv object lock to protect the + * pindex/object of the resulting reservations against rename while + * we are inspecting. */ first = pindex - VM_RESERV_INDEX(object, pindex); + vm_reserv_object_lock(object); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; - if (leftcap > first) + if (leftcap > first) { + vm_reserv_object_unlock(object); return (NULL); + } } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; - if (first + VM_LEVEL_0_NPAGES > rightcap) + if (first + VM_LEVEL_0_NPAGES > rightcap) { + vm_reserv_object_unlock(object); return (NULL); + } } + vm_reserv_object_unlock(object); /* * Would a new reservation extend past the end of the object? 
@@ -712,37 +925,10 @@ rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv)); - KASSERT(rv->object == NULL, - ("vm_reserv_alloc_page: reserv %p isn't free", rv)); - LIST_INSERT_HEAD(&object->rvq, rv, objq); - rv->object = object; - rv->pindex = first; - rv->domain = domain; - KASSERT(rv->popcnt == 0, - ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv)); - KASSERT(!rv->inpartpopq, - ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv)); - for (i = 0; i < NPOPMAP; i++) - KASSERT(rv->popmap[i] == 0, - ("vm_reserv_alloc_page: reserv %p's popmap is corrupted", - rv)); + vm_reserv_insert(rv, object, first); index = VM_RESERV_INDEX(object, pindex); vm_reserv_populate(rv, index); return (&rv->pages[index]); - - /* - * Found a matching reservation. - */ -found: - index = VM_RESERV_INDEX(object, pindex); - m = &rv->pages[index]; - KASSERT(object != kernel_object || vm_phys_domidx(m) == domain, - ("vm_reserv_alloc_page: Domain mismatch from reservation.")); - /* Handle vm_page_rename(m, new_object, ...). */ - if (popmap_is_set(rv->popmap, index)) - return (NULL); - vm_reserv_populate(rv, index); - return (m); } /* @@ -759,14 +945,8 @@ { int begin_zeroes, hi, i, lo; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - KASSERT(rv->object != NULL, - ("vm_reserv_break: reserv %p is free", rv)); - KASSERT(!rv->inpartpopq, - ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv)); - LIST_REMOVE(rv, objq); - rv->object = NULL; - rv->domain = -1; + vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); + vm_reserv_remove(rv); if (m != NULL) { /* * Since the reservation is being broken, there is no harm in @@ -830,9 +1010,26 @@ vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; + struct vm_domain *vmd; - mtx_lock(&vm_page_queue_free_mtx); + /* + * This access of object->rvq is unsynchronized so that the + * object rvq lock can nest after the domain_free lock. We + * must check for races in the results. However, the object + * lock prevents new additions, so we are guaranteed that when + * it returns NULL the object is properly empty. + */ + vmd = NULL; while ((rv = LIST_FIRST(&object->rvq)) != NULL) { + if (vmd != VM_DOMAIN(rv->domain)) { + if (vmd != NULL) + vm_domain_free_unlock(vmd); + vmd = VM_DOMAIN(rv->domain); + vm_domain_free_lock(vmd); + } + /* Reclaim race. 
*/ + if (rv->object != object) + continue; KASSERT(rv->object == object, ("vm_reserv_break_all: reserv %p is corrupted", rv)); if (rv->inpartpopq) { @@ -841,7 +1038,8 @@ } vm_reserv_break(rv, NULL); } - mtx_unlock(&vm_page_queue_free_mtx); + if (vmd != NULL) + vm_domain_free_unlock(vmd); } /* @@ -855,8 +1053,8 @@ { vm_reserv_t rv; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); + vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); if (rv->object == NULL) return (FALSE); vm_reserv_depopulate(rv, m - rv->pages); @@ -886,6 +1084,8 @@ while (paddr + VM_LEVEL_0_SIZE <= seg->end) { vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages = PHYS_TO_VM_PAGE(paddr); + vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].domain = + seg->domain; paddr += VM_LEVEL_0_SIZE; } } @@ -902,8 +1102,8 @@ { vm_reserv_t rv; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); + vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); if (rv->object == NULL) return (false); return (popmap_is_clear(rv->popmap, m - rv->pages)); @@ -945,7 +1145,7 @@ vm_reserv_reclaim(vm_reserv_t rv) { - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); KASSERT(rv->inpartpopq, ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains, @@ -969,7 +1169,7 @@ { vm_reserv_t rv; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(domain)); if ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) { vm_reserv_reclaim(rv); return (TRUE); @@ -993,7 +1193,7 @@ vm_reserv_t rv; int hi, i, lo, low_index, next_free; - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); + vm_domain_free_assert_locked(VM_DOMAIN(domain)); if (npages > VM_LEVEL_0_NPAGES - 1) return (FALSE); size = npages << PAGE_SHIFT; @@ -1084,14 +1284,19 @@ VM_OBJECT_ASSERT_WLOCKED(new_object); rv = vm_reserv_from_page(m); if (rv->object == old_object) { - mtx_lock(&vm_page_queue_free_mtx); + vm_domain_free_lock(VM_DOMAIN(rv->domain)); if (rv->object == old_object) { + vm_reserv_object_lock(old_object); + rv->object = NULL; LIST_REMOVE(rv, objq); - LIST_INSERT_HEAD(&new_object->rvq, rv, objq); + vm_reserv_object_unlock(old_object); + vm_reserv_object_lock(new_object); rv->object = new_object; rv->pindex -= old_object_offset; + LIST_INSERT_HEAD(&new_object->rvq, rv, objq); + vm_reserv_object_unlock(new_object); } - mtx_unlock(&vm_page_queue_free_mtx); + vm_domain_free_unlock(VM_DOMAIN(rv->domain)); } } @@ -1121,6 +1326,7 @@ { vm_paddr_t new_end; size_t size; + int i; /* * Calculate the size (in bytes) of the reservation array. Round up @@ -1139,6 +1345,10 @@ vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero(vm_reserv_array, size); + + for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++) + mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL, + MTX_DEF); /* * Return the next available physical address. Index: sys/vm/vm_swapout.c =================================================================== --- sys/vm/vm_swapout.c +++ sys/vm/vm_swapout.c @@ -650,7 +650,7 @@ loop: if (vm_page_count_min()) { - VM_WAIT; + vm_wait_min(); goto loop; } Index: sys/vm/vnode_pager.c =================================================================== --- sys/vm/vnode_pager.c +++ sys/vm/vnode_pager.c @@ -1167,7 +1167,7 @@ * daemon up. This should be probably be addressed XXX. 
*/ - if (vm_cnt.v_free_count < vm_cnt.v_pageout_free_min) + if (vm_page_count_min()) flags |= VM_PAGER_PUT_SYNC; /*