Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -123,6 +123,7 @@
 	vm_map_t map;
 	vm_map_entry_t entry;
 	int lookup_still_valid;
+	int ra_done;
 	struct vnode *vp;
 };
 
@@ -137,7 +138,12 @@
 
 	vm_page_xunbusy(fs->m);
 	vm_page_lock(fs->m);
-	vm_page_deactivate(fs->m);
+	if (fs->m->valid == 0) {
+		if (fs->m->wire_count == 0)
+			vm_page_free(fs->m);
+	} else {
+		vm_page_deactivate(fs->m);
+	}
 	vm_page_unlock(fs->m);
 	fs->m = NULL;
 }
@@ -292,13 +298,15 @@
 	struct faultstate fs;
 	struct vnode *vp;
 	vm_page_t m;
-	int ahead, behind, cluster_offset, error, locked;
+	int ahead, behind, cluster_offset, dead, error, locked, rv;
+	u_char behavior;
 
 	hardfault = 0;
 	growstack = TRUE;
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
 	faultcount = 0;
+	fs.ra_done = FALSE;
 
 RetryFault:;
 
@@ -412,7 +420,7 @@
 
 	fs.lookup_still_valid = TRUE;
 
-	fs.first_m = NULL;
+	fs.m = fs.first_m = NULL;
 
 	/*
 	 * Search for the page at object/offset.
@@ -421,11 +429,20 @@
 	fs.pindex = fs.first_pindex;
 	while (TRUE) {
 		/*
-		 * If the object is dead, we stop here
+		 * If the object is marked for imminent termination,
+		 * we retry here, since the collapse pass has raced
+		 * with us. Otherwise, if we see terminally dead
+		 * object, return fail.
 		 */
-		if (fs.object->flags & OBJ_DEAD) {
+		if ((fs.object->flags & OBJ_DEAD) != 0) {
+			dead = fs.object->type == OBJT_DEAD;
+			if (fs.m != NULL && fs.m != fs.first_m)
+				release_page(&fs);
 			unlock_and_deallocate(&fs);
-			return (KERN_PROTECTION_FAILURE);
+			if (dead)
+				return (KERN_PROTECTION_FAILURE);
+			pause("vmf_de", 1);
+			goto RetryFault;
 		}
 
 		/*
@@ -550,9 +567,18 @@
		 * at the same time.
		 */
		if (fs.object->type != OBJT_DEFAULT) {
-			int rv;
-			u_char behavior = vm_map_entry_behavior(fs.entry);
-
+			if (!fs.lookup_still_valid) {
+				locked = vm_map_trylock_read(fs.map);
+				if (locked)
+					fs.lookup_still_valid = TRUE;
+				if (!locked || fs.map->timestamp !=
+				    map_generation) {
+					release_page(&fs);
+					unlock_and_deallocate(&fs);
+					goto RetryFault;
+				}
+			}
+			behavior = vm_map_entry_behavior(fs.entry);
			era = fs.entry->read_ahead;
			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
			    P_KILLED(curproc)) {
@@ -563,7 +589,8 @@
			behind = 0;
			nera = VM_FAULT_READ_AHEAD_MAX;
			ahead = nera;
-			if (fs.pindex == fs.entry->next_read)
+			if (fs.pindex == fs.entry->next_read &&
+			    !fs.ra_done)
				vm_fault_dontneed(&fs, vaddr, ahead);
		} else if (fs.pindex == fs.entry->next_read) {
			/*
@@ -574,14 +601,17 @@
			 * x (read ahead min + 1) + read ahead min"
			 */
			behind = 0;
-			nera = VM_FAULT_READ_AHEAD_MIN;
-			if (era > 0) {
-				nera += era + 1;
-				if (nera > VM_FAULT_READ_AHEAD_MAX)
-					nera = VM_FAULT_READ_AHEAD_MAX;
+			if (!fs.ra_done) {
+				if (era > 0) {
+					nera += era + 1;
+					if (nera > VM_FAULT_READ_AHEAD_MAX)
+						nera = VM_FAULT_READ_AHEAD_MAX;
+				} else
+					nera = VM_FAULT_READ_AHEAD_MIN;
			}
			ahead = nera;
-			if (era == VM_FAULT_READ_AHEAD_MAX)
+			if (era == VM_FAULT_READ_AHEAD_MAX &&
+			    !fs.ra_done)
				vm_fault_dontneed(&fs, vaddr, ahead);
		} else {
			/*
@@ -603,7 +633,18 @@
		}
		ahead = ulmin(ahead, atop(fs.entry->end - vaddr) - 1);
		if (era != nera)
+			/*
+			 * Only read-lock on map is held
+			 * there. It is fine for other thread
+			 * faulting on the same entry to race
+			 * with us for this update, causing
+			 * some inaccuracy in the read-ahead
+			 * heuristic. We do not separate two
+			 * different streams of sequential
+			 * faults on one entry anyway.
+			 */
			fs.entry->read_ahead = nera;
+		fs.ra_done = TRUE;
 
		/*
		 * Call the pager to retrieve the data, if any, after
@@ -924,8 +965,10 @@
	 * If the page was filled by a pager, update the map entry's
	 * last read offset.
	 *
-	 * XXX The following assignment modifies the map
-	 * without holding a write lock on it.
+	 * The next_read assignment modifies the map without holding a
+	 * write lock on it, which is acceptable. See the motivation
+	 * in the comment above, before updating the entry->read_ahead
+	 * field in the same manner.
	 */
	if (hardfault)
		fs.entry->next_read = fs.pindex + ahead + 1;
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -229,6 +229,7 @@
 
 	LIST_INIT(&object->shadow_head);
 	object->type = type;
+	VM_OBJECT_WLOCK(object);
 	switch (type) {
 	case OBJT_DEAD:
 		panic("_vm_object_allocate: can't create OBJT_DEAD");
@@ -252,6 +253,7 @@
 	default:
 		panic("_vm_object_allocate: type %d is undefined", type);
 	}
+	VM_OBJECT_WUNLOCK(object);
 	object->size = size;
 	object->generation = 1;
 	object->ref_count = 1;
@@ -1584,7 +1586,7 @@
 			continue;
 		}
 
-		KASSERT(pp == NULL || pp->valid != 0,
+		KASSERT(pp == NULL || pp->wire_count > 0 || pp->valid != 0,
 		    ("unbusy invalid page %p", pp));
 
 		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -169,10 +169,16 @@
 		/*
 		 * don't double-terminate the object
 		 */
-		if ((obj->flags & OBJ_DEAD) == 0)
+		if ((obj->flags & OBJ_DEAD) == 0) {
 			vm_object_terminate(obj);
-		else
+		} else {
+			if ((obj->flags & OBJ_DISCONNECTWNT) != 0) {
+				vm_object_clear_flag(obj, OBJ_DISCONNECTWNT);
+				wakeup(obj);
+			}
+			vp->v_object = NULL;
 			VM_OBJECT_WUNLOCK(obj);
+		}
 	} else {
 		/*
 		 * Woe to the process that tries to page now :-).
@@ -180,7 +186,7 @@
 		 */
 		vm_pager_deallocate(obj);
 		VM_OBJECT_WUNLOCK(obj);
 	}
-	vp->v_object = NULL;
+	KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
 }
 