diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -130,6 +130,7 @@
 	bool	oom_started;
 	int	nera;
 	bool	can_read_lock;
+	bool	can_sbusy;
 
 	/* Page reference for cow. */
 	vm_page_t m_cow;
@@ -165,6 +166,7 @@
 	FAULT_OUT_OF_BOUNDS,	/* Invalid address for pager. */
 	FAULT_HARD,		/* Performed I/O. */
 	FAULT_SOFT,		/* Found valid page. */
+	FAULT_SOFT_MSHAREDBUSY,	/* Found valid page and busied it shared. */
 	FAULT_PROTECTION_FAILURE, /* Invalid access. */
 };
 
@@ -204,7 +206,10 @@
 		 * pageout while optimizing fault restarts.
 		 */
 		vm_page_deactivate(m);
-		vm_page_xunbusy(m);
+		if (vm_page_xbusied(m))
+			vm_page_xunbusy(m);
+		else
+			vm_page_sunbusy(m);
 		*mp = NULL;
 	}
 }
@@ -1002,10 +1007,20 @@
 	return (KERN_SUCCESS);
 }
 
-static void
-vm_fault_cow(struct faultstate *fs)
+static bool
+vm_fault_can_cow_rename(struct faultstate *fs)
 {
-	bool is_first_object_locked;
+	return (
+	    /* Only one shadow object and no other refs. */
+	    fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
+	    /* No other ways to look the object up. */
+	    fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0);
+}
+
+static void
+vm_fault_cow(struct faultstate *fs, int res)
+{
+	bool is_first_object_locked, fast_cow;
 
 	KASSERT(fs->object != fs->first_object,
 	    ("source and target COW objects are identical"));
@@ -1019,21 +1034,28 @@
 	 * object so that it will go out to swap when needed.
 	 */
 	is_first_object_locked = false;
-	if (
-	    /*
-	     * Only one shadow object and no other refs.
-	     */
-	    fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
-	    /*
-	     * No other ways to look the object up
-	     */
-	    fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
-	    /*
-	     * We don't chase down the shadow chain and we can acquire locks.
-	     */
-	    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
-	    fs->object == fs->first_object->backing_object &&
-	    VM_OBJECT_TRYWLOCK(fs->object)) {
+	fast_cow = false;
+
+	if (vm_fault_can_cow_rename(fs)) {
+		/*
+		 * Check that we don't chase down the shadow chain and
+		 * we can acquire locks.
+		 */
+		is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object);
+		if (is_first_object_locked &&
+		    fs->object == fs->first_object->backing_object) {
+			if (res == FAULT_SOFT_MSHAREDBUSY) {
+				fast_cow = VM_OBJECT_WOWNED(fs->object) ||
+				    VM_OBJECT_TRYUPGRADE(fs->object);
+			} else {
+				fast_cow = VM_OBJECT_TRYWLOCK(fs->object);
+			}
+		}
+	}
+
+	if (fast_cow) {
+		vm_page_assert_xbusied(fs->m);
+
 		/*
 		 * Remove but keep xbusy for replace.  fs->m is moved into
 		 * fs->first_object and left busy while fs->first_m is
@@ -1084,16 +1106,23 @@
 		 * removed from those other address spaces.
 		 *
 		 * The flag check is racy, but this is tolerable: if
-		 * OBJ_ONEMAPPING is cleared after the check, the busy state
-		 * ensures that new mappings of m_cow can't be created.
-		 * pmap_enter() will replace an existing mapping in the current
-		 * address space.  If OBJ_ONEMAPPING is set after the check,
-		 * removing mappings will at worse trigger some unnecessary page
-		 * faults.
+		 * OBJ_ONEMAPPING is cleared after the check, either
+		 * the exclusive busy state or the check for
+		 * shadow_count in vm_fault_object() ensures that new
+		 * mappings of m_cow can't be created.  pmap_enter()
+		 * will replace an existing mapping in the current
+		 * address space.  If OBJ_ONEMAPPING is set after the
+		 * check, removing mappings will at worse trigger some
+		 * unnecessary page faults.
 		 */
-		vm_page_assert_xbusied(fs->m_cow);
+		if (res == FAULT_SOFT_MSHAREDBUSY)
+			vm_page_assert_busied(fs->m_cow);
+		else
+			vm_page_assert_xbusied(fs->m_cow);
 		if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0)
 			pmap_remove_all(fs->m_cow);
+		if (res == FAULT_SOFT_MSHAREDBUSY)
+			VM_OBJECT_UNLOCK(fs->object);
 	}
 
 	vm_object_pip_wakeup(fs->object);
@@ -1487,6 +1516,32 @@
 	vm_page_iter_init(&pages, fs->object);
 	fs->m = vm_radix_iter_lookup(&pages, fs->pindex);
 	if (fs->m != NULL) {
+		/*
+		 * Unlocked check for validity, rechecked after busy
+		 * is obtained.
+		 */
+		if (vm_page_all_valid(fs->m) && fs->can_sbusy &&
+		    ((fs->fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) == 0 ||
+		    fs->object != fs->first_object) &&
+		    !(vm_fault_can_cow_rename(fs) &&
+		    fs->object == fs->first_object->backing_object)) {
+			if (!vm_page_trysbusy(fs->m)) {
+restart:
+				fs->can_sbusy = false;
+				vm_fault_busy_sleep(fs);
+				return (FAULT_RESTART);
+			}
+			if (!vm_page_all_valid(fs->m)) {
+				vm_page_sunbusy(fs->m);
+				goto restart;
+			}
+			/*
+			 * Keep fs->object locked for validity of the
+			 * CoW checks.
+			 */
+			return (FAULT_SOFT_MSHAREDBUSY);
+		}
+
 		if (!vm_page_tryxbusy(fs->m)) {
 			vm_fault_busy_sleep(fs);
 			return (FAULT_RESTART);
@@ -1546,8 +1601,8 @@
 	return (res);
 }
 
-int
-vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+static int
+vm_fault_rglocked(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold)
 {
 	struct pctrie_iter pages;
@@ -1555,12 +1610,7 @@
 	int ahead, behind, faultcount, rv;
 	enum fault_status res;
 	enum fault_next_status res_next;
-	bool hardfault;
-
-	VM_CNT_INC(v_vm_faults);
-
-	if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
-		return (KERN_PROTECTION_FAILURE);
+	bool hardfault, unlock_object;
 
 	fs.vp = NULL;
 	fs.vaddr = vaddr;
@@ -1571,6 +1621,7 @@
 	fs.oom_started = false;
 	fs.nera = -1;
 	fs.can_read_lock = true;
+	fs.can_sbusy = true;
 	faultcount = 0;
 	hardfault = false;
 
@@ -1654,6 +1705,7 @@
 		res = vm_fault_object(&fs, &behind, &ahead);
 		switch (res) {
 		case FAULT_SOFT:
+		case FAULT_SOFT_MSHAREDBUSY:
 			goto found;
 		case FAULT_HARD:
 			faultcount = behind + 1 + ahead;
@@ -1704,8 +1756,14 @@
 	 * A valid page has been found and exclusively busied.  The
 	 * object lock must no longer be held.
 	 */
-	vm_page_assert_xbusied(fs.m);
-	VM_OBJECT_ASSERT_UNLOCKED(fs.object);
+	vm_page_assert_busied(fs.m);
+	if (res != FAULT_SOFT_MSHAREDBUSY) {
+		unlock_object = false;
+		VM_OBJECT_ASSERT_UNLOCKED(fs.object);
+	} else {
+		unlock_object = true;
+		VM_OBJECT_ASSERT_LOCKED(fs.object);
+	}
 
 	/*
 	 * If the page is being written, but isn't already owned by the
@@ -1713,11 +1771,14 @@
 	 * top-level object.
 	 */
 	if (fs.object != fs.first_object) {
+
 		/*
 		 * We only really need to copy if we want to write it.
 		 */
		if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
-			vm_fault_cow(&fs);
+			vm_fault_cow(&fs, res);
+			unlock_object = false;
+
 			/*
 			 * We only try to prefault read-only mappings to the
 			 * neighboring pages when this copy-on-write fault is
@@ -1731,6 +1792,8 @@
 			fs.prot &= ~VM_PROT_WRITE;
 		}
 	}
+	if (unlock_object)
+		VM_OBJECT_UNLOCK(fs.object);
 
 	/*
 	 * We must verify that the maps have not changed since our last
@@ -1773,7 +1836,7 @@
 	 * Page must be completely valid or it is not fit to
 	 * map into user space.  vm_pager_get_pages() ensures this.
 	 */
-	vm_page_assert_xbusied(fs.m);
+	vm_page_assert_busied(fs.m);
 	KASSERT(vm_page_all_valid(fs.m),
 	    ("vm_fault: page %p partially invalid", fs.m));
 
@@ -1805,7 +1868,10 @@
 		(*fs.m_hold) = fs.m;
 		vm_page_wire(fs.m);
 	}
-	vm_page_xunbusy(fs.m);
+	if (vm_page_xbusied(fs.m))
+		vm_page_xunbusy(fs.m);
+	else
+		vm_page_sunbusy(fs.m);
 	fs.m = NULL;
 
 	/*
@@ -1836,6 +1902,24 @@
 	return (KERN_SUCCESS);
 }
 
+int
+vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
+    int fault_flags, vm_page_t *m_hold)
+{
+	void *cookie;
+	int rv;
+
+	VM_CNT_INC(v_vm_faults);
+
+	if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
+		return (KERN_PROTECTION_FAILURE);
+
+	cookie = rangelock_wlock(&map->fltlock, vaddr, vaddr + PAGE_SIZE);
+	rv = vm_fault_rglocked(map, vaddr, fault_type, fault_flags, m_hold);
+	rangelock_unlock(&map->fltlock, cookie);
+	return (rv);
+}
+
 /*
  * Speed up the reclamation of pages that precede the faulting pindex within
  * the first object of the shadow chain.  Essentially, perform the equivalent
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -65,6 +65,7 @@
 #define	_VM_MAP_
 
 #include <sys/lock.h>
+#include <sys/rangelock.h>
 #include <sys/sx.h>
 #include <sys/_mutex.h>
 
@@ -206,6 +207,7 @@
 		struct sx lock;		/* Lock for map data */
 		struct mtx system_mtx;
 	};
+	struct rangelock fltlock;
 	int nentries;			/* Number of entries */
 	vm_size_t size;			/* virtual size */
 	u_int timestamp;		/* Version number */
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -896,6 +896,7 @@
 	map->timestamp = 0;
 	map->busy = 0;
 	map->anon_loc = 0;
+	rangelock_init(&map->fltlock);
 #ifdef DIAGNOSTIC
 	map->nupdates = 0;
 #endif
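
For orientation only: reassembled from the vm_fault.c hunks above, the new vm_fault() entry point ends up looking roughly like the sketch below. The fault counter and the TDP_NOFAULTING check stay in this thin wrapper, the old function body becomes vm_fault_rglocked(), and the wrapper write-locks the faulted page's range in the new per-map rangelock so that concurrent faults on the same range of a map are serialized. This is a readability restatement of the patch, not additional code, and the final file layout may differ slightly.

/*
 * Sketch of the new vm_fault() wrapper, reassembled from the hunks above.
 */
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
    int fault_flags, vm_page_t *m_hold)
{
	void *cookie;
	int rv;

	VM_CNT_INC(v_vm_faults);

	/* Threads that must not fault still bail out up front. */
	if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
		return (KERN_PROTECTION_FAILURE);

	/*
	 * Serialize against other faults on the same page range of this
	 * map, then run the former vm_fault() body.
	 */
	cookie = rangelock_wlock(&map->fltlock, vaddr, vaddr + PAGE_SIZE);
	rv = vm_fault_rglocked(map, vaddr, fault_type, fault_flags, m_hold);
	rangelock_unlock(&map->fltlock, cookie);
	return (rv);
}

Because the soft-fault path can now return FAULT_SOFT_MSHAREDBUSY with the page only shared-busied, the release paths in the patch check vm_page_xbusied() before choosing between vm_page_xunbusy() and vm_page_sunbusy().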