Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -5656,8 +5656,8 @@
         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
             va >= kmi.clean_eva,
             ("pmap_enter: managed mapping within the clean submap"));
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
             ("pmap_enter: flags %u has reserved bits set", flags));
         pa = VM_PAGE_TO_PHYS(m);
Index: sys/arm/arm/pmap-v4.c
===================================================================
--- sys/arm/arm/pmap-v4.c
+++ sys/arm/arm/pmap-v4.c
@@ -2979,8 +2979,8 @@
                 pa = systempage.pv_pa;
                 m = NULL;
         } else {
-                if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                        VM_OBJECT_ASSERT_LOCKED(m->object);
+                if ((m->oflags & VPO_UNMANAGED) == 0)
+                        VM_PAGE_OBJECT_BUSY_ASSERT(m);
                 pa = VM_PAGE_TO_PHYS(m);
         }
         nflags = 0;
Index: sys/arm/arm/pmap-v6.c
===================================================================
--- sys/arm/arm/pmap-v6.c
+++ sys/arm/arm/pmap-v6.c
@@ -3876,8 +3876,8 @@
         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
             va >= kmi.clean_eva,
             ("%s: managed mapping within the clean submap", __func__));
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
             ("%s: flags %u has reserved bits set", __func__, flags));
         pa = VM_PAGE_TO_PHYS(m);
Index: sys/arm64/arm64/pmap.c
===================================================================
--- sys/arm64/arm64/pmap.c
+++ sys/arm64/arm64/pmap.c
@@ -3183,8 +3183,8 @@
         int lvl, rv;
 
         va = trunc_page(va);
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
         pa = VM_PAGE_TO_PHYS(m);
         new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
             L3_PAGE);
Index: sys/i386/i386/pmap.c
===================================================================
--- sys/i386/i386/pmap.c
+++ sys/i386/i386/pmap.c
@@ -3603,8 +3603,8 @@
         KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 ||
             va < kmi.clean_sva || va >= kmi.clean_eva,
             ("pmap_enter: managed mapping within the clean submap"));
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
             ("pmap_enter: flags %u has reserved bits set", flags));
         pa = VM_PAGE_TO_PHYS(m);
Index: sys/mips/mips/pmap.c
===================================================================
--- sys/mips/mips/pmap.c
+++ sys/mips/mips/pmap.c
@@ -2063,8 +2063,8 @@
         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
             va >= kmi.clean_eva,
             ("pmap_enter: managed mapping within the clean submap"));
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
         pa = VM_PAGE_TO_PHYS(m);
         newpte = TLBLO_PA_TO_PFN(pa) | init_pte_prot(m, flags, prot);
         if ((flags & PMAP_ENTER_WIRED) != 0)
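Every pmap_enter() hunk above makes the same substitution, and the remaining pmap back ends below repeat it. The sketch that follows restates the old and new preconditions for a managed page side by side; it is illustrative only (the helper names are not part of the patch) and assumes a kernel build with the usual sys/ and vm/ headers plus the VM_PAGE_OBJECT_BUSY_ASSERT() definition added to vm_page.h later in this diff.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

/* Old rule: object locked unless the caller holds the exclusive busy. */
static void
sketch_old_enter_precondition(vm_page_t m)
{

        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
                VM_OBJECT_ASSERT_LOCKED(m->object);
}

/* New rule: page busy (shared or exclusive), or the owning object busied. */
static void
sketch_new_enter_precondition(vm_page_t m)
{

        if ((m->oflags & VPO_UNMANAGED) == 0)
                VM_PAGE_OBJECT_BUSY_ASSERT(m);
}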
Index: sys/powerpc/aim/mmu_oea.c
===================================================================
--- sys/powerpc/aim/mmu_oea.c
+++ sys/powerpc/aim/mmu_oea.c
@@ -1149,8 +1149,8 @@
         if (pmap_bootstrapped)
                 rw_assert(&pvh_global_lock, RA_WLOCKED);
         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
         if ((m->oflags & VPO_UNMANAGED) != 0 || !moea_initialized) {
                 pvo_head = &moea_pvo_kunmanaged;
Index: sys/powerpc/aim/mmu_oea64.c
===================================================================
--- sys/powerpc/aim/mmu_oea64.c
+++ sys/powerpc/aim/mmu_oea64.c
@@ -1395,8 +1395,8 @@
         uint64_t pte_lo;
         int error;
 
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
         pvo = alloc_pvo_entry(0);
         if (pvo == NULL)
Index: sys/powerpc/booke/pmap.c
===================================================================
--- sys/powerpc/booke/pmap.c
+++ sys/powerpc/booke/pmap.c
@@ -2278,8 +2278,8 @@
                 KASSERT((va <= VM_MAXUSER_ADDRESS),
                     ("mmu_booke_enter_locked: user pmap, non user va"));
         }
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
Index: sys/riscv/riscv/pmap.c
===================================================================
--- sys/riscv/riscv/pmap.c
+++ sys/riscv/riscv/pmap.c
@@ -2643,8 +2643,8 @@
         bool nosleep;
 
         va = trunc_page(va);
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
         pa = VM_PAGE_TO_PHYS(m);
         pn = (pa / PAGE_SIZE);
 
Index: sys/sparc64/sparc64/pmap.c
===================================================================
--- sys/sparc64/sparc64/pmap.c
+++ sys/sparc64/sparc64/pmap.c
@@ -1500,8 +1500,8 @@
 
         rw_assert(&tte_list_global_lock, RA_WLOCKED);
         PMAP_LOCK_ASSERT(pm, MA_OWNED);
-        if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+        if ((m->oflags & VPO_UNMANAGED) == 0)
+                VM_PAGE_OBJECT_BUSY_ASSERT(m);
         PMAP_STATS_INC(pmap_nenter);
         pa = VM_PAGE_TO_PHYS(m);
         wired = (flags & PMAP_ENTER_WIRED) != 0;
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -278,11 +278,14 @@
         int psind, rv;
 
         MPASS(fs->vp == NULL);
+        vm_object_busy(fs->first_object);
         m = vm_page_lookup(fs->first_object, fs->first_pindex);
         /* A busy page can be mapped for read|execute access. */
         if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
-            vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
-                return (KERN_FAILURE);
+            vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL) {
+                rv = KERN_FAILURE;
+                goto out;
+        }
         m_map = m;
         psind = 0;
 #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
@@ -320,7 +323,7 @@
         rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type |
             PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind);
         if (rv != KERN_SUCCESS)
-                return (rv);
+                goto out;
         if (m_hold != NULL) {
                 *m_hold = m;
                 vm_page_wire(m);
@@ -331,7 +334,10 @@
         VM_OBJECT_RUNLOCK(fs->first_object);
         vm_map_lookup_done(fs->map, fs->entry);
         curthread->td_ru.ru_minflt++;
-        return (KERN_SUCCESS);
+
+out:
+        vm_object_unbusy(fs->first_object);
+        return (rv);
 }
 
 static void
@@ -1285,8 +1291,8 @@
         if (hardfault)
                 fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
-        vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
         vm_page_assert_xbusied(fs.m);
+        vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
 
         /*
          * Page must be completely valid or it is not fit to
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -84,6 +84,7 @@
  *      vm_object_t             Virtual memory object.
  *
  *      List of locks
+ *      (a) atomic
  *      (c) const until freed
  *      (o) per-object lock
  *      (f) free pages queue mutex
@@ -112,6 +113,7 @@
         u_short flags;                  /* see below */
         u_short pg_color;               /* (c) color of first page in obj */
         volatile u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */
+        volatile u_int busy;            /* (a) object is busy, disallow page busy. */
         int resident_page_count;        /* number of resident pages */
         struct vm_object *backing_object; /* object that I'm a shadow of */
         vm_ooffset_t backing_object_offset;/* Offset in backing object */
@@ -313,6 +315,18 @@
 void vm_object_pip_wait(vm_object_t object, char *waitid);
 void vm_object_pip_wait_unlocked(vm_object_t object, char *waitid);
 
+void vm_object_busy(vm_object_t object);
+void vm_object_unbusy(vm_object_t object);
+void vm_object_busy_wait(vm_object_t object, const char *wmesg);
+
+static inline bool
+vm_object_busied(vm_object_t object)
+{
+
+        return (object->busy != 0);
+}
+#define VM_OBJECT_ASSERT_BUSY(object)   MPASS(vm_object_busied((object)))
+
 void umtx_shm_object_init(vm_object_t object);
 void umtx_shm_object_terminated(vm_object_t object);
 extern int umtx_shm_vnobj_persistent;
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -83,6 +83,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -195,6 +196,9 @@
             ("object %p has reservations",
             object));
 #endif
+        KASSERT(object->busy == 0,
+            ("object %p busy = %d",
+            object, object->busy));
         KASSERT(object->paging_in_progress == 0,
             ("object %p paging_in_progress = %d",
             object, object->paging_in_progress));
@@ -223,6 +227,7 @@
         object->ref_count = 0;
         vm_radix_init(&object->rtree);
         refcount_init(&object->paging_in_progress, 0);
+        refcount_init(&object->busy, 0);
         object->resident_page_count = 0;
         object->shadow_count = 0;
         object->flags = OBJ_DEAD;
@@ -2238,6 +2243,40 @@
         return (vp);
 }
 
+
+/*
+ * Busy the vm object.  This prevents new pages belonging to the object from
+ * becoming busy.  Existing pages persist as busy.  Callers are responsible
+ * for checking page state before proceeding.
+ */
+void
+vm_object_busy(vm_object_t obj)
+{
+
+        VM_OBJECT_ASSERT_LOCKED(obj);
+
+        refcount_acquire(&obj->busy);
+}
+
+void
+vm_object_unbusy(vm_object_t obj)
+{
+
+        VM_OBJECT_ASSERT_LOCKED(obj);
+
+        refcount_release(&obj->busy);
+}
+
+void
+vm_object_busy_wait(vm_object_t obj, const char *wmesg)
+{
+
+        VM_OBJECT_ASSERT_UNLOCKED(obj);
+
+        if (obj->busy)
+                refcount_sleep(&obj->busy, wmesg, PVM);
+}
+
 /*
  * Return the kvme type of the given object.
  * If vpp is not NULL, set it to the object's vm_object_vnode() or NULL.
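Taken together, the vm_object busy counter and the vm_fault_soft_fast() change above implement a simple protocol: busy the object while it is still locked, do the page checks without taking page busy, and unbusy on every exit path. The following minimal sketch restates that protocol outside of the fault handler; the function name is illustrative, and the usual kernel headers plus the new vm_object_busy()/vm_object_unbusy() primitives from this diff are assumed.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

/*
 * Look up a resident, fully valid, non-busy page while the object is
 * busied, mirroring the structure of vm_fault_soft_fast() above.
 */
static int
sketch_busy_lookup(vm_object_t object, vm_pindex_t pindex, vm_page_t *mp)
{
        vm_page_t m;
        int rv;

        VM_OBJECT_ASSERT_LOCKED(object);
        vm_object_busy(object);         /* no new page busies from here on */
        m = vm_page_lookup(object, pindex);
        if (m == NULL || vm_page_busied(m) || m->valid != VM_PAGE_BITS_ALL) {
                /* Pages that were already busy are not excluded; bail out. */
                rv = KERN_FAILURE;
                goto out;
        }
        *mp = m;
        rv = KERN_SUCCESS;
out:
        vm_object_unbusy(object);
        return (rv);
}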
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -615,6 +615,7 @@
 bool vm_page_try_remove_all(vm_page_t m);
 bool vm_page_try_remove_write(vm_page_t m);
 int vm_page_trysbusy(vm_page_t m);
+int vm_page_tryxbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
 void vm_page_unswappable(vm_page_t m);
 void vm_page_unwire(vm_page_t m, uint8_t queue);
@@ -666,10 +667,6 @@
             (m));                                               \
 } while (0)
 
-#define vm_page_tryxbusy(m)                                     \
-        (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED,   \
-            VPB_SINGLE_EXCLUSIVER))
-
 #define vm_page_xbusied(m)                                      \
         (((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
 
@@ -687,13 +684,13 @@
 } while (0)
 
 #ifdef INVARIANTS
-void vm_page_object_lock_assert(vm_page_t m);
-#define VM_PAGE_OBJECT_LOCK_ASSERT(m)   vm_page_object_lock_assert(m)
+void vm_page_object_busy_assert(vm_page_t m);
+#define VM_PAGE_OBJECT_BUSY_ASSERT(m)   vm_page_object_busy_assert(m)
 void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits);
 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits)                   \
         vm_page_assert_pga_writeable(m, bits)
 #else
-#define VM_PAGE_OBJECT_LOCK_ASSERT(m)   (void)0
+#define VM_PAGE_OBJECT_BUSY_ASSERT(m)   (void)0
 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits)   (void)0
 #endif
 
@@ -837,7 +834,7 @@
 vm_page_undirty(vm_page_t m)
 {
 
-        VM_PAGE_OBJECT_LOCK_ASSERT(m);
+        VM_PAGE_OBJECT_BUSY_ASSERT(m);
         m->dirty = 0;
 }
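vm_page_tryxbusy() stops being a macro here because it now has to consult the owning object's busy counter (see the vm_page.c hunks below), so the failure mode of the try widens from "page already busy" to "page busy or object busied". A minimal caller sketch under that assumption follows; the function name is illustrative and the usual kernel headers are assumed.

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

/*
 * Opportunistically mark a page clean.  vm_page_tryxbusy() now fails both
 * when the page is busy and when the owning object has been busied with
 * vm_object_busy(), so a single failure return covers both cases.
 */
static bool
sketch_try_undirty(vm_page_t m)
{

        if (vm_page_tryxbusy(m) == 0)
                return (false);
        /* Page exclusive busy satisfies VM_PAGE_OBJECT_BUSY_ASSERT(). */
        vm_page_undirty(m);
        vm_page_xunbusy(m);
        return (true);
}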
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -157,6 +157,8 @@
 static uma_zone_t fakepg_zone;
 
 static void vm_page_alloc_check(vm_page_t m);
+static void _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
+    const char *wmesg, bool nonshared, bool locked);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 static void vm_page_dequeue_complete(vm_page_t m);
 static void vm_page_enqueue(vm_page_t m, uint8_t queue);
@@ -875,7 +877,6 @@
 vm_page_busy_acquire(vm_page_t m, int allocflags)
 {
         vm_object_t obj;
-        u_int x;
         bool locked;
 
         /*
@@ -896,27 +897,13 @@
                 }
                 if ((allocflags & VM_ALLOC_NOWAIT) != 0)
                         return (FALSE);
-                if (obj != NULL) {
+                if (obj != NULL)
                         locked = VM_OBJECT_WOWNED(obj);
-                } else {
-                        MPASS(vm_page_wired(m));
+                else
                         locked = FALSE;
-                }
-                sleepq_lock(m);
-                x = m->busy_lock;
-                if (x == VPB_UNBUSIED ||
-                    ((allocflags & VM_ALLOC_SBUSY) != 0 &&
-                    (x & VPB_BIT_SHARED) != 0) ||
-                    ((x & VPB_BIT_WAITERS) == 0 &&
-                    !atomic_cmpset_int(&m->busy_lock, x,
-                    x | VPB_BIT_WAITERS))) {
-                        sleepq_release(m);
-                        continue;
-                }
-                if (locked)
-                        VM_OBJECT_WUNLOCK(obj);
-                sleepq_add(m, NULL, "vmpba", 0, 0);
-                sleepq_wait(m, PVM);
+                MPASS(locked || vm_page_wired(m));
+                _vm_page_busy_sleep(obj, m, "vmpba",
+                    (allocflags & VM_ALLOC_SBUSY) != 0, locked);
                 if (locked)
                         VM_OBJECT_WLOCK(obj);
                 MPASS(m->object == obj || m->object == NULL);
@@ -1032,22 +1019,42 @@
 vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
 {
         vm_object_t obj;
-        u_int x;
 
         obj = m->object;
-        vm_page_lock_assert(m, MA_NOTOWNED);
         VM_OBJECT_ASSERT_LOCKED(obj);
+        vm_page_lock_assert(m, MA_NOTOWNED);
+        _vm_page_busy_sleep(obj, m, wmesg, nonshared, true);
+}
+
+static void
+_vm_page_busy_sleep(vm_object_t obj, vm_page_t m, const char *wmesg,
+    bool nonshared, bool locked)
+{
+        u_int x;
+
+        /*
+         * If the object is busy we must wait for that to drain to zero
+         * before trying the page again.
+         */
+        if (obj != NULL && vm_object_busied(obj)) {
+                if (locked)
+                        VM_OBJECT_DROP(obj);
+                vm_object_busy_wait(obj, wmesg);
+                return;
+        }
         sleepq_lock(m);
         x = m->busy_lock;
         if (x == VPB_UNBUSIED ||
             (nonshared && (x & VPB_BIT_SHARED) != 0) ||
             ((x & VPB_BIT_WAITERS) == 0 &&
             !atomic_cmpset_int(&m->busy_lock, x,
             x | VPB_BIT_WAITERS))) {
-                VM_OBJECT_DROP(obj);
+                if (locked)
+                        VM_OBJECT_DROP(obj);
                 sleepq_release(m);
                 return;
         }
-        VM_OBJECT_DROP(obj);
+        if (locked)
+                VM_OBJECT_DROP(obj);
         sleepq_add(m, NULL, wmesg, 0, 0);
         sleepq_wait(m, PVM);
 }
@@ -1062,16 +1069,56 @@
 int
 vm_page_trysbusy(vm_page_t m)
 {
+        vm_object_t obj;
         u_int x;
 
+        obj = m->object;
         x = m->busy_lock;
         for (;;) {
                 if ((x & VPB_BIT_SHARED) == 0)
                         return (0);
+                /*
+                 * Reduce the window for transient busies that will trigger
+                 * false negatives in vm_page_ps_test().
+                 */
+                if (obj != NULL && vm_object_busied(obj))
+                        return (0);
                 if (atomic_fcmpset_acq_int(&m->busy_lock, &x,
                     x + VPB_ONE_SHARER))
-                        return (1);
+                        break;
         }
+
+        /* Refetch the object now that we're guaranteed that it is stable. */
+        obj = m->object;
+        if (obj != NULL && vm_object_busied(obj)) {
+                vm_page_sunbusy(m);
+                return (0);
+        }
+        return (1);
+}
+
+/*
+ * vm_page_tryxbusy:
+ *
+ *      Try to exclusive busy a page.
+ *      If the operation succeeds 1 is returned otherwise 0.
+ *      The operation never sleeps.
+ */
+int
+vm_page_tryxbusy(vm_page_t m)
+{
+        vm_object_t obj;
+
+        if (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED,
+            VPB_SINGLE_EXCLUSIVER) == 0)
+                return (0);
+
+        obj = m->object;
+        if (obj != NULL && vm_object_busied(obj)) {
+                vm_page_xunbusy(m);
+                return (0);
+        }
+        return (1);
 }
 
 /*
@@ -1293,15 +1340,15 @@
         vm_page_lock_assert(m, MA_NOTOWNED);
         VM_OBJECT_ASSERT_WLOCKED(m->object);
 
-        if (vm_page_busied(m)) {
-                /*
-                 * The page-specific object must be cached because page
-                 * identity can change during the sleep, causing the
-                 * re-lock of a different object.
-                 * It is assumed that a reference to the object is already
-                 * held by the callers.
-                 */
-                obj = m->object;
+        /*
+         * The page-specific object must be cached because page
+         * identity can change during the sleep, causing the
+         * re-lock of a different object.
+         * It is assumed that a reference to the object is already
+         * held by the callers.
+         */
+        obj = m->object;
+        if (vm_page_busied(m) || (obj != NULL && obj->busy)) {
                 vm_page_busy_sleep(m, msg, false);
                 VM_OBJECT_WLOCK(obj);
                 return (TRUE);
@@ -1326,15 +1373,15 @@
         vm_page_lock_assert(m, MA_NOTOWNED);
         VM_OBJECT_ASSERT_WLOCKED(m->object);
 
-        if (vm_page_xbusied(m)) {
-                /*
-                 * The page-specific object must be cached because page
-                 * identity can change during the sleep, causing the
-                 * re-lock of a different object.
-                 * It is assumed that a reference to the object is already
-                 * held by the callers.
-                 */
-                obj = m->object;
+        /*
+         * The page-specific object must be cached because page
+         * identity can change during the sleep, causing the
+         * re-lock of a different object.
+         * It is assumed that a reference to the object is already
+         * held by the callers.
+         */
+        obj = m->object;
+        if (vm_page_xbusied(m) || (obj != NULL && obj->busy)) {
                 vm_page_busy_sleep(m, msg, true);
                 VM_OBJECT_WLOCK(obj);
                 return (TRUE);
@@ -4806,17 +4853,15 @@
 
 #ifdef INVARIANTS
 void
-vm_page_object_lock_assert(vm_page_t m)
+vm_page_object_busy_assert(vm_page_t m)
 {
 
         /*
          * Certain of the page's fields may only be modified by the
-         * holder of the containing object's lock or the exclusive busy.
-         * holder.  Unfortunately, the holder of the write busy is
-         * not recorded, and thus cannot be checked here.
+         * holder of a page or object busy.
          */
-        if (m->object != NULL && !vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_WLOCKED(m->object);
+        if (m->object != NULL && !vm_page_busied(m))
+                VM_OBJECT_ASSERT_BUSY(m->object);
 }
 
 void
@@ -4834,7 +4879,7 @@
         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
             ("PGA_WRITEABLE on unmanaged page"));
         if (!vm_page_xbusied(m))
-                VM_OBJECT_ASSERT_LOCKED(m->object);
+                VM_OBJECT_ASSERT_BUSY(m->object);
 }
 #endif
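The final INVARIANTS hunk changes what setting PGA_WRITEABLE demands of callers: the page must be exclusive busied, or, failing that, the owning object must be busied; the object lock alone no longer satisfies the assertion. A minimal caller sketch under those rules; the function name is illustrative and a managed page with an associated object is assumed.

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

/*
 * Record that a managed page is about to gain a writeable mapping.
 * With this patch, vm_page_assert_pga_writeable() accepts either the
 * page exclusive busy (asserted here) or a busied owning object in
 * place of the old object-lock requirement.
 */
static void
sketch_mark_writeable(vm_page_t m)
{

        vm_page_assert_xbusied(m);
        vm_page_aflag_set(m, PGA_WRITEABLE);
}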