Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -123,6 +123,7 @@
 	vm_map_t map;
 	vm_map_entry_t entry;
 	int lookup_still_valid;
+	int ra_done;
 	struct vnode *vp;
 };
 
@@ -137,7 +138,12 @@
 
 	vm_page_xunbusy(fs->m);
 	vm_page_lock(fs->m);
-	vm_page_deactivate(fs->m);
+	if (fs->m->valid == 0) {
+		if (fs->m->wire_count == 0)
+			vm_page_free(fs->m);
+	} else {
+		vm_page_deactivate(fs->m);
+	}
 	vm_page_unlock(fs->m);
 	fs->m = NULL;
 }
@@ -292,13 +298,15 @@
 	struct faultstate fs;
 	struct vnode *vp;
 	vm_page_t m;
-	int ahead, behind, cluster_offset, error, locked;
+	int ahead, behind, cluster_offset, dead, error, locked, rv;
+	u_char behavior;
 
 	hardfault = 0;
 	growstack = TRUE;
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
 	faultcount = 0;
+	fs.ra_done = FALSE;
 
 RetryFault:;
@@ -412,7 +420,7 @@
 
 	fs.lookup_still_valid = TRUE;
 
-	fs.first_m = NULL;
+	fs.m = fs.first_m = NULL;
 
 	/*
 	 * Search for the page at object/offset.
@@ -421,11 +429,20 @@
 	fs.pindex = fs.first_pindex;
 	while (TRUE) {
 		/*
-		 * If the object is dead, we stop here
+		 * If the object is marked for imminent termination,
+		 * we retry here, since the collapse pass has raced
+		 * with us.  Otherwise, if we see a terminally dead
+		 * object, return failure.
 		 */
-		if (fs.object->flags & OBJ_DEAD) {
+		if ((fs.object->flags & OBJ_DEAD) != 0) {
+			dead = fs.object->type == OBJT_DEAD;
+			if (fs.m != NULL && fs.m != fs.first_m)
+				release_page(&fs);
 			unlock_and_deallocate(&fs);
-			return (KERN_PROTECTION_FAILURE);
+			if (dead)
+				return (KERN_PROTECTION_FAILURE);
+			pause("vmf_de", 1);
+			goto RetryFault;
 		}
 
 		/*
@@ -550,9 +567,18 @@
			 * at the same time.
			 */
			if (fs.object->type != OBJT_DEFAULT) {
-				int rv;
-				u_char behavior = vm_map_entry_behavior(fs.entry);
-
+				if (!fs.lookup_still_valid) {
+					locked = vm_map_trylock_read(fs.map);
+					if (locked)
+						fs.lookup_still_valid = TRUE;
+					if (!locked || fs.map->timestamp !=
+					    map_generation) {
+						release_page(&fs);
+						unlock_and_deallocate(&fs);
+						goto RetryFault;
+					}
+				}
+				behavior = vm_map_entry_behavior(fs.entry);
				era = fs.entry->read_ahead;
				if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
				    P_KILLED(curproc)) {
@@ -563,7 +589,8 @@
					behind = 0;
					nera = VM_FAULT_READ_AHEAD_MAX;
					ahead = nera;
-					if (fs.pindex == fs.entry->next_read)
+					if (fs.pindex == fs.entry->next_read &&
+					    !fs.ra_done)
						vm_fault_dontneed(&fs, vaddr, ahead);
				} else if (fs.pindex == fs.entry->next_read) {
					/*
@@ -574,14 +601,17 @@
					 * x (read ahead min + 1) + read ahead min"
					 */
					behind = 0;
-					nera = VM_FAULT_READ_AHEAD_MIN;
-					if (era > 0) {
-						nera += era + 1;
-						if (nera > VM_FAULT_READ_AHEAD_MAX)
-							nera = VM_FAULT_READ_AHEAD_MAX;
+					if (!fs.ra_done) {
+						if (era > 0) {
+							nera += era + 1;
+							if (nera > VM_FAULT_READ_AHEAD_MAX)
+								nera = VM_FAULT_READ_AHEAD_MAX;
+						} else
+							nera = VM_FAULT_READ_AHEAD_MIN;
					}
					ahead = nera;
-					if (era == VM_FAULT_READ_AHEAD_MAX)
+					if (era == VM_FAULT_READ_AHEAD_MAX &&
+					    !fs.ra_done)
						vm_fault_dontneed(&fs, vaddr, ahead);
				} else {
					/*
@@ -603,7 +633,18 @@
				}
				ahead = ulmin(ahead, atop(fs.entry->end - vaddr) - 1);
				if (era != nera)
+					/*
+					 * Only a read lock on the map is
+					 * held here.  It is fine for another
+					 * thread faulting on the same entry
+					 * to race with us on this update,
+					 * causing some inaccuracy in the
+					 * read-ahead heuristic.  We do not
+					 * separate two different streams of
+					 * sequential faults on one entry
+					 * anyway.
+					 */
					fs.entry->read_ahead = nera;
+				fs.ra_done = TRUE;
 
				/*
				 * Call the pager to retrieve the data, if any, after
@@ -924,8 +965,10 @@
	 * If the page was filled by a pager, update the map entry's
	 * last read offset.
	 *
-	 * XXX The following assignment modifies the map
-	 * without holding a write lock on it.
+	 * The next_read assignment modifies the map without holding a
+	 * write lock on it, which is acceptable.  See the rationale
+	 * in the comment above the similar unlocked update of the
+	 * entry->read_ahead field.
	 */
	if (hardfault)
		fs.entry->next_read = fs.pindex + ahead + 1;
Index: sys/vm/vm_meter.c
===================================================================
--- sys/vm/vm_meter.c
+++ sys/vm/vm_meter.c
@@ -209,8 +209,10 @@
				total.t_armshr += object->resident_page_count;
			}
		}
+		object->actmark = FALSE;
	}
	mtx_unlock(&vm_object_list_mtx);
+	sx_xunlock(&vmmeter_lock);
	total.t_free = vm_cnt.v_free_count + vm_cnt.v_cache_count;
	return (sysctl_handle_opaque(oidp, &total, sizeof(total), req));
 }
Index: sys/vm/vm_object.h
===================================================================
--- sys/vm/vm_object.h
+++ sys/vm/vm_object.h
@@ -94,6 +94,7 @@
 *	(c)	const until freed
 *	(o)	per-object lock
 *	(f)	free pages queue mutex
+ *	(v)	vm_meter lock
 *
 */
 
@@ -110,6 +111,7 @@
	int shadow_count;		/* how many objects that this is a shadow for */
	vm_memattr_t memattr;		/* default memory attribute for pages */
	objtype_t type;			/* type of pager */
+	u_char actmark;			/* (v) accounted as active */
	u_short flags;			/* see below */
	u_short pg_color;		/* (c) color of first page in obj */
	u_int paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -1584,7 +1584,7 @@
			continue;
		}
 
-		KASSERT(pp == NULL || pp->valid != 0,
+		KASSERT(pp == NULL || pp->wire_count > 0 || pp->valid != 0,
		    ("unbusy invalid page %p", pp));
 
		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -552,6 +552,7 @@
	    (m));							\
 } while (0)
 
+/* Note: the lock of page m must not be owned by the caller. */
 #define	vm_page_xunbusy(m) do {						\
	if (!atomic_cmpset_rel_int(&(m)->busy_lock,			\
	    VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED))			\
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -760,17 +760,36 @@
 }
 
 static void
+vm_page_xunbusy_locked(vm_page_t m)
+{
+
+	vm_page_assert_xbusied(m);
+	vm_page_assert_locked(m);
+
+	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+	wakeup(m);
+}
+
+static void
 vm_page_xunbusy_maybelocked(vm_page_t m)
 {
	bool lockacq;
 
	vm_page_assert_xbusied(m);
 
+	/*
+	 * Fast path for unbusy.  If it succeeds, we know that there
+	 * are no waiters, so we do not need a wakeup.
+	 */
+	if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
+	    VPB_UNBUSIED))
+		return;
+
	lockacq = !mtx_owned(vm_page_lockptr(m));
	if (lockacq)
		vm_page_lock(m);
-	vm_page_flash(m);
-	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+	/* There is a waiter, do wakeup() instead of vm_page_flash(). */
+	vm_page_xunbusy_locked(m);
	if (lockacq)
		vm_page_unlock(m);
 }
@@ -788,8 +807,7 @@
 
	vm_page_assert_xbusied(m);
 
	vm_page_lock(m);
-	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
-	wakeup(m);
+	vm_page_xunbusy_locked(m);
	vm_page_unlock(m);
 }
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -169,10 +169,16 @@
		/*
		 * don't double-terminate the object
		 */
-		if ((obj->flags & OBJ_DEAD) == 0)
+		if ((obj->flags & OBJ_DEAD) == 0) {
			vm_object_terminate(obj);
-		else
+		} else {
+			if ((obj->flags & OBJ_DISCONNECTWNT) != 0) {
+				vm_object_clear_flag(obj, OBJ_DISCONNECTWNT);
+				wakeup(obj);
+			}
+			vp->v_object = NULL;
			VM_OBJECT_WUNLOCK(obj);
+		}
	} else {
		/*
		 * Woe to the process that tries to page now :-).
@@ -180,7 +186,7 @@
		vm_pager_deallocate(obj);
		VM_OBJECT_WUNLOCK(obj);
	}
-	vp->v_object = NULL;
+	KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
 }
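
The following standalone sketch (not part of the patch) illustrates the read-ahead sizing used by the vm_fault.c hunks above: each sequential fault grows the window by roughly "previous window + read ahead min + 1", clamped to a maximum, while random access disables read-ahead entirely.  The constants and the fault_read_ahead() helper are illustrative stand-ins, not the kernel's, and the sketch ignores fs.ra_done, the "behind" count, and vm_fault_dontneed().

#include <stdbool.h>
#include <stdio.h>

/* Illustrative limits; the kernel derives its own values. */
#define	READ_AHEAD_MIN	7
#define	READ_AHEAD_MAX	31

static int
fault_read_ahead(int era, bool sequential)
{
	int nera;

	if (!sequential)
		return (0);	/* Random access: no read-ahead. */
	nera = READ_AHEAD_MIN;
	if (era > 0) {
		/* Sequential again: grow the previous window, then clamp. */
		nera += era + 1;
		if (nera > READ_AHEAD_MAX)
			nera = READ_AHEAD_MAX;
	}
	return (nera);
}

int
main(void)
{
	int era, i;

	era = 0;
	for (i = 1; i <= 5; i++) {
		era = fault_read_ahead(era, true);
		printf("sequential fault %d: read ahead %d pages\n", i, era);
	}
	return (0);
}

Running it prints a window of 7, 15, 23, 31, 31 pages, showing the geometric-style ramp up to the cap.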
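Similarly, here is a minimal userspace model (again not part of the patch) of the exclusive-busy release protocol that the vm_page.c hunks introduce: first try a single release CAS from "exclusively busied, no waiters" to "unbusied"; only if that fails, meaning a waiter is recorded in the busy word, take the page lock, clear the busy word, and issue the wakeup.  VPB_WAITERS, page_lock(), page_unlock(), and page_wakeup() are invented stand-ins for the kernel primitives.

#include <stdatomic.h>
#include <stdio.h>

#define	VPB_UNBUSIED		0u
#define	VPB_SINGLE_EXCLUSIVER	(1u << 0)	/* exclusive owner, no waiters */
#define	VPB_WAITERS		(1u << 1)	/* a sleeper is recorded */

struct page {
	_Atomic unsigned int busy_lock;
};

/* Stand-ins for the page lock and wakeup(). */
static void page_lock(struct page *m)   { (void)m; }
static void page_unlock(struct page *m) { (void)m; }
static void page_wakeup(struct page *m) { printf("wakeup %p\n", (void *)m); }

static void
page_xunbusy(struct page *m)
{
	unsigned int expect = VPB_SINGLE_EXCLUSIVER;

	/* Fast path: no waiters, so a release CAS is enough. */
	if (atomic_compare_exchange_strong_explicit(&m->busy_lock, &expect,
	    VPB_UNBUSIED, memory_order_release, memory_order_relaxed))
		return;

	/* Slow path: a waiter exists; unbusy under the lock and wake it. */
	page_lock(m);
	atomic_store_explicit(&m->busy_lock, VPB_UNBUSIED,
	    memory_order_release);
	page_wakeup(m);
	page_unlock(m);
}

int
main(void)
{
	struct page m;

	atomic_init(&m.busy_lock, VPB_SINGLE_EXCLUSIVER);
	page_xunbusy(&m);			/* fast path, no wakeup */

	atomic_init(&m.busy_lock, VPB_SINGLE_EXCLUSIVER | VPB_WAITERS);
	page_xunbusy(&m);			/* slow path, prints wakeup */
	return (0);
}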