Changeset View
Standalone View
sys/vm/vm_fault.c
| Show First 20 Lines • Show All 65 Lines • ▼ Show 20 Lines | |||||||||||||
| * any improvements or extensions that they make and grant Carnegie the | * any improvements or extensions that they make and grant Carnegie the | ||||||||||||
| * rights to redistribute these changes. | * rights to redistribute these changes. | ||||||||||||
| */ | */ | ||||||||||||
| /* | /* | ||||||||||||
| * Page fault handling module. | * Page fault handling module. | ||||||||||||
| */ | */ | ||||||||||||
| #include <sys/cdefs.h> | |||||||||||||
| #include "opt_ktrace.h" | #include "opt_ktrace.h" | ||||||||||||
| #include "opt_vm.h" | #include "opt_vm.h" | ||||||||||||
| #include <sys/param.h> | |||||||||||||
| #include <sys/systm.h> | #include <sys/systm.h> | ||||||||||||
| #include <sys/kernel.h> | #include <sys/kernel.h> | ||||||||||||
| #include <sys/lock.h> | #include <sys/lock.h> | ||||||||||||
| #include <sys/mman.h> | #include <sys/mman.h> | ||||||||||||
| #include <sys/mutex.h> | #include <sys/mutex.h> | ||||||||||||
| #include <sys/pctrie.h> | #include <sys/pctrie.h> | ||||||||||||
| #include <sys/proc.h> | #include <sys/proc.h> | ||||||||||||
| #include <sys/racct.h> | #include <sys/racct.h> | ||||||||||||
| ▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines | |||||||||||||
| enum fault_status { | enum fault_status { | ||||||||||||
| FAULT_SUCCESS = 10000, /* Return success to user. */ | FAULT_SUCCESS = 10000, /* Return success to user. */ | ||||||||||||
| FAULT_FAILURE, /* Return failure to user. */ | FAULT_FAILURE, /* Return failure to user. */ | ||||||||||||
| FAULT_CONTINUE, /* Continue faulting. */ | FAULT_CONTINUE, /* Continue faulting. */ | ||||||||||||
| FAULT_RESTART, /* Restart fault. */ | FAULT_RESTART, /* Restart fault. */ | ||||||||||||
| FAULT_OUT_OF_BOUNDS, /* Invalid address for pager. */ | FAULT_OUT_OF_BOUNDS, /* Invalid address for pager. */ | ||||||||||||
| FAULT_HARD, /* Performed I/O. */ | FAULT_HARD, /* Performed I/O. */ | ||||||||||||
| FAULT_SOFT, /* Found valid page. */ | FAULT_SOFT, /* Found valid page. */ | ||||||||||||
| FAULT_PROTECTION_FAILURE, /* Invalid access. */ | FAULT_PROTECTION_FAILURE, /* Invalid access. */ | ||||||||||||
markj: Rather than adding a new result and passing that around, can we instead add a new… | |||||||||||||
| }; | }; | ||||||||||||
| enum fault_next_status { | enum fault_next_status { | ||||||||||||
| FAULT_NEXT_GOTOBJ = 1, | FAULT_NEXT_GOTOBJ = 1, | ||||||||||||
| FAULT_NEXT_NOOBJ, | FAULT_NEXT_NOOBJ, | ||||||||||||
| FAULT_NEXT_RESTART, | FAULT_NEXT_RESTART, | ||||||||||||
| }; | }; | ||||||||||||
| Show All 22 Lines | vm_fault_page_release(vm_page_t *mp) | ||||||||||||
| m = *mp; | m = *mp; | ||||||||||||
| if (m != NULL) { | if (m != NULL) { | ||||||||||||
| /* | /* | ||||||||||||
| * We are likely to loop around again and attempt to busy | * We are likely to loop around again and attempt to busy | ||||||||||||
| * this page. Deactivating it leaves it available for | * this page. Deactivating it leaves it available for | ||||||||||||
| * pageout while optimizing fault restarts. | * pageout while optimizing fault restarts. | ||||||||||||
| */ | */ | ||||||||||||
| vm_page_deactivate(m); | vm_page_deactivate(m); | ||||||||||||
| if (vm_page_xbusied(m)) | |||||||||||||
| vm_page_xunbusy(m); | vm_page_xunbusy(m); | ||||||||||||
| else | |||||||||||||
| vm_page_sunbusy(m); | |||||||||||||
| *mp = NULL; | *mp = NULL; | ||||||||||||
| } | } | ||||||||||||
| } | } | ||||||||||||
| static inline void | static inline void | ||||||||||||
| vm_fault_page_free(vm_page_t *mp) | vm_fault_page_free(vm_page_t *mp) | ||||||||||||
| { | { | ||||||||||||
| vm_page_t m; | vm_page_t m; | ||||||||||||
| ▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines | if (need_dirty && vm_page_set_dirty(m) == 0) { | ||||||||||||
| if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0) | if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0) | ||||||||||||
| vm_page_aflag_set(m, PGA_NOSYNC); | vm_page_aflag_set(m, PGA_NOSYNC); | ||||||||||||
| else | else | ||||||||||||
| vm_page_aflag_clear(m, PGA_NOSYNC); | vm_page_aflag_clear(m, PGA_NOSYNC); | ||||||||||||
| } | } | ||||||||||||
| } | } | ||||||||||||
| static bool | |||||||||||||
| vm_fault_is_read(const struct faultstate *fs) | |||||||||||||
| { | |||||||||||||
| return ((fs->prot & VM_PROT_WRITE) == 0 && | |||||||||||||
| (fs->fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) == 0); | |||||||||||||
| } | |||||||||||||
| static bool | |||||||||||||
| vm_fault_fo_is_onemapping(const struct faultstate *fs) | |||||||||||||
| { | |||||||||||||
| return (fs->object != fs->first_object && | |||||||||||||
| (fs->first_object->flags & OBJ_ONEMAPPING) != 0); | |||||||||||||
| } | |||||||||||||
| /* | /* | ||||||||||||
| * Unlocks fs.first_object and fs.map on success. | * Unlocks fs.first_object and fs.map on success. | ||||||||||||
| */ | */ | ||||||||||||
| static enum fault_status | static enum fault_status | ||||||||||||
| vm_fault_soft_fast(struct faultstate *fs) | vm_fault_soft_fast(struct faultstate *fs) | ||||||||||||
| { | { | ||||||||||||
| vm_page_t m, m_map; | vm_page_t m, m_map; | ||||||||||||
| #if VM_NRESERVLEVEL > 0 | #if VM_NRESERVLEVEL > 0 | ||||||||||||
| Show All 29 Lines | #endif | ||||||||||||
| */ | */ | ||||||||||||
| if (m->object != fs->first_object || m->pindex != fs->first_pindex) | if (m->object != fs->first_object || m->pindex != fs->first_pindex) | ||||||||||||
| goto fail; | goto fail; | ||||||||||||
| vm_object_busy(fs->first_object); | vm_object_busy(fs->first_object); | ||||||||||||
| if (!vm_page_all_valid(m) || | if (!vm_page_all_valid(m) || | ||||||||||||
| ((fs->prot & VM_PROT_WRITE) != 0 && vm_page_busied(m))) | ((fs->prot & VM_PROT_WRITE) != 0 && vm_page_busied(m))) | ||||||||||||
Done Inline ActionsWe cannot create a writeable mapping while a page is sbusy. Or at least, we do not permit it today. (See, e.g., the assertion in vm_pageout_flush() which checks that the newly-written pages are not mapped for writing.) In other words, I think this part of the change is wrong. As an aside, I wonder why we require the contents of pages to be stable while they are being flushed to disk. For filesystems which perform data checksumming, it is maybe required (not for ZFS since it copies the data to be flushed), but most of our filesystems do not care. Similarly, I cannot see why it would matter for swap-backed pages. So long as we do not incorrectly mark pages clean, I cannot see why this is required from a correctness perspective, though it is certainly easier to reason about. markj: We cannot create a writeable mapping while a page is sbusy. Or at least, we do not permit it… | |||||||||||||
Done Inline ActionsYes, intent was to allow read-mappings for sbusy, but it already happens. We even allow read-mappings for xbusy. Might be this should be hardened and only allow it for sbusy. WRT the second question, I do think it is reasonable to allow writes, unless we completely give up on idea that something consistent is written out. For instance, it is not unreasonable for the app to expect that a store to the mapped file inside the page boundary is either fully written to the storage or not written at all. kib: Yes, intent was to allow read-mappings for sbusy, but it already happens. We even allow read… | |||||||||||||
| goto fail_busy; | goto fail_busy; | ||||||||||||
| m_map = m; | m_map = m; | ||||||||||||
| psind = 0; | psind = 0; | ||||||||||||
| #if VM_NRESERVLEVEL > 0 | #if VM_NRESERVLEVEL > 0 | ||||||||||||
| if ((m->flags & PG_FICTITIOUS) == 0 && | if ((m->flags & PG_FICTITIOUS) == 0 && | ||||||||||||
| (m_super = vm_reserv_to_superpage(m)) != NULL) { | (m_super = vm_reserv_to_superpage(m)) != NULL) { | ||||||||||||
| psind = m_super->psind; | psind = m_super->psind; | ||||||||||||
| ▲ Show 20 Lines • Show All 611 Lines • ▼ Show 20 Lines | vm_fault_relookup(struct faultstate *fs) | ||||||||||||
| /* Reassert because wired may have changed. */ | /* Reassert because wired may have changed. */ | ||||||||||||
| KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0, | KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0, | ||||||||||||
| ("!wired && VM_FAULT_WIRE")); | ("!wired && VM_FAULT_WIRE")); | ||||||||||||
| return (KERN_SUCCESS); | return (KERN_SUCCESS); | ||||||||||||
| } | } | ||||||||||||
| static bool | |||||||||||||
| vm_fault_can_cow_rename(struct faultstate *fs) | |||||||||||||
| { | |||||||||||||
| return ( | |||||||||||||
| /* Only one shadow object and no other refs. */ | |||||||||||||
| fs->object->shadow_count == 1 && fs->object->ref_count == 1 && | |||||||||||||
| /* No other ways to look the object up. */ | |||||||||||||
| fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0); | |||||||||||||
| } | |||||||||||||
| static void | static void | ||||||||||||
| vm_fault_cow(struct faultstate *fs) | vm_fault_cow(struct faultstate *fs, int res) | ||||||||||||
| { | { | ||||||||||||
| bool is_first_object_locked; | bool is_first_object_locked, rename_cow; | ||||||||||||
| KASSERT(fs->object != fs->first_object, | KASSERT(fs->object != fs->first_object, | ||||||||||||
| ("source and target COW objects are identical")); | ("source and target COW objects are identical")); | ||||||||||||
| /* | /* | ||||||||||||
| * This allows pages to be virtually copied from a backing_object | * This allows pages to be virtually copied from a backing_object | ||||||||||||
| * into the first_object, where the backing object has no other | * into the first_object, where the backing object has no other | ||||||||||||
| * refs to it, and cannot gain any more refs. Instead of a bcopy, | * refs to it, and cannot gain any more refs. Instead of a bcopy, | ||||||||||||
| * we just move the page from the backing object to the first | * we just move the page from the backing object to the first | ||||||||||||
| * object. Note that we must mark the page dirty in the first | * object. Note that we must mark the page dirty in the first | ||||||||||||
| * object so that it will go out to swap when needed. | * object so that it will go out to swap when needed. | ||||||||||||
| */ | */ | ||||||||||||
| is_first_object_locked = false; | is_first_object_locked = false; | ||||||||||||
| if ( | rename_cow = false; | ||||||||||||
| if (vm_fault_can_cow_rename(fs)) { | |||||||||||||
| /* | /* | ||||||||||||
| * Only one shadow object and no other refs. | * Check that we don't chase down the shadow chain and | ||||||||||||
| * we can acquire locks. | |||||||||||||
| */ | */ | ||||||||||||
| fs->object->shadow_count == 1 && fs->object->ref_count == 1 && | is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object); | ||||||||||||
| /* | if (is_first_object_locked && | ||||||||||||
| * No other ways to look the object up | |||||||||||||
| */ | |||||||||||||
| fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 && | |||||||||||||
| /* | |||||||||||||
| * We don't chase down the shadow chain and we can acquire locks. | |||||||||||||
| */ | |||||||||||||
| (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) && | |||||||||||||
| fs->object == fs->first_object->backing_object && | fs->object == fs->first_object->backing_object && | ||||||||||||
| VM_OBJECT_TRYWLOCK(fs->object)) { | vm_page_xbusied(fs->m)) | ||||||||||||
| rename_cow = VM_OBJECT_TRYWLOCK(fs->object); | |||||||||||||
| } | |||||||||||||
| if (rename_cow) { | |||||||||||||
| vm_page_assert_xbusied(fs->m); | |||||||||||||
| /* | /* | ||||||||||||
| * Remove but keep xbusy for replace. fs->m is moved into | * Remove but keep xbusy for replace. fs->m is moved into | ||||||||||||
| * fs->first_object and left busy while fs->first_m is | * fs->first_object and left busy while fs->first_m is | ||||||||||||
| * conditionally freed. | * conditionally freed. | ||||||||||||
Done Inline ActionsI don't quite follow this. If fs->m is xbusied, then we must have res != FAULT_SOFT_MSHAREDBUSY, so why do we handle the sharedbusy case specially above? markj: I don't quite follow this. If fs->m is xbusied, then we must have `res !=… | |||||||||||||
Done Inline ActionsIn fact fast_cow must not happen for MSHAREDBUSY case at all. kib: In fact fast_cow must not happen for MSHAREDBUSY case at all. | |||||||||||||
| */ | */ | ||||||||||||
| vm_page_remove_xbusy(fs->m); | vm_page_remove_xbusy(fs->m); | ||||||||||||
| vm_page_replace(fs->m, fs->first_object, fs->first_pindex, | vm_page_replace(fs->m, fs->first_object, fs->first_pindex, | ||||||||||||
| fs->first_m); | fs->first_m); | ||||||||||||
| vm_page_dirty(fs->m); | vm_page_dirty(fs->m); | ||||||||||||
| #if VM_NRESERVLEVEL > 0 | #if VM_NRESERVLEVEL > 0 | ||||||||||||
| /* | /* | ||||||||||||
| * Rename the reservation. | * Rename the reservation. | ||||||||||||
| Show All 36 Lines | #endif | ||||||||||||
| * | * | ||||||||||||
| * The flag check is racy, but this is tolerable: if | * The flag check is racy, but this is tolerable: if | ||||||||||||
| * OBJ_ONEMAPPING is cleared after the check, the busy state | * OBJ_ONEMAPPING is cleared after the check, the busy state | ||||||||||||
| * ensures that new mappings of m_cow can't be created. | * ensures that new mappings of m_cow can't be created. | ||||||||||||
| * pmap_enter() will replace an existing mapping in the current | * pmap_enter() will replace an existing mapping in the current | ||||||||||||
| * address space. If OBJ_ONEMAPPING is set after the check, | * address space. If OBJ_ONEMAPPING is set after the check, | ||||||||||||
| removing mappings will at worst trigger some unnecessary page | * removing mappings will at worst trigger some unnecessary page | ||||||||||||
| * faults. | * faults. | ||||||||||||
| * | |||||||||||||
| * In the fs->m shared busy case, the xbusy state of | |||||||||||||
| * fs->first_m prevents new mappings of fs->m from | |||||||||||||
Done Inline ActionsI don't quite follow--what if m_cow->object is mapped directly into another process' address space? Oh, you are also checking m_cow->object->ref_count == 1. markj: I don't quite follow--what if m_cow->object is mapped directly into another process' address… | |||||||||||||
Done Inline ActionsIf m->object is mapped directly, then what is the problem? Or rather, it is the same situation as before my patch. I do not change the cow handling, I need only to avoid the situation where cow would rename the page, and the page is shared-busied. So your question is interesting, but it is valid for the non-patched code as well, I believe. kib: If m->object is mapped directly, then what is the problem? Or rather, it is the same situation… | |||||||||||||
| * being created because a parallel fault on this | |||||||||||||
Done Inline Actions
markj: | |||||||||||||
Done Inline ActionsI think this explanation doesn't justify the code. The original problem scenario is that object O has a shadow object O', and O' is mapped by multiple processes. Then, suppose m_cow is mapped read-only into all of those processes. Suppose some process triggers a copy-on-write fault, causing m_cow to be copied into the shadow object. We must shoot down all of the mappings of m_cow, because it is fully shadowed and may be freed. I cannot see how that is prevented in the sbusy case. I also do not see how "the sbusy case is only possible when the shadow object is read-mapped" is true. Here we are handling a COW fault, so clearly the shadow object is mapped RW. markj: I think this explanation doesn't justify the code. The original problem scenario is that object… | |||||||||||||
Done Inline ActionsIf the cow fault is triggered and the page is copied, it must be write-fault or cow-fault. Both types are checked and excluded in the condition in vm_fault_object(). kib: If the cow fault is triggered and the page is copied, it must be write-fault or cow-fault. | |||||||||||||
Done Inline ActionsSorry, I don't really follow. They're only excluded in the case where fs->object == fs->first_object, from my reading, but this is not sufficient. There is a test case in tests/sys/vm/shared_shadow_inval_test.c which exercises this case and explains the problem scenario a bit more. I do not see any problems when testing it with the patch applied, but I will try to spend more time to construct a problematic test case or discover that I'm wrong about something. markj: Sorry, I don't really follow. They're only excluded in the case where `fs->object == fs… | |||||||||||||
Done Inline ActionsOk, I extracted the checkers into two dedicated helpers, hope this way it would be clearer. I verify that:
Then I sbusy fs->m; if the fault is for read, it is enough. kib: Ok, I extracted the checkers into two dedicated helpers, hope this way it would be clearer.
I… | |||||||||||||
| * shadow chain should wait for xbusy on fs->first_m. | |||||||||||||
Done Inline ActionsI think this observation can be used to simplify the whole comment. I can submit a patch for that after this work is landed. markj: I think this observation can be used to simplify the whole comment. I can submit a patch for… | |||||||||||||
| */ | */ | ||||||||||||
Done Inline Actions
markj: | |||||||||||||
| vm_page_assert_xbusied(fs->m_cow); | if (!vm_page_xbusied(fs->m_cow)) | ||||||||||||
Done Inline ActionsThis place is somewhat worrisome. fs->m_cow is only shared-busy. I can check that first_object->OBJ_ONEMAPPING is clear in the condition for shared-busy path. kib: This place is somewhat worrisome. fs->m_cow is only shared-busy. I can check that first_object… | |||||||||||||
Done Inline ActionsE.g. we can take the fltlock for the whole AS on vmspace_fork(), which would keep OBJ_ONEMAPPING stable for faults. kib: E.g. we can take the fltlock for the whole AS on vmspace_fork(), which would keep… | |||||||||||||
| if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0) | VM_OBJECT_UNLOCK(fs->object); | ||||||||||||
| else if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0) | |||||||||||||
| pmap_remove_all(fs->m_cow); | pmap_remove_all(fs->m_cow); | ||||||||||||
| } | } | ||||||||||||
Done Inline ActionsIf this is a sharedbusy soft fault, then before the unlock can we assert that
? markj: If this is a sharedbusy soft fault, then before the unlock can we assert that
1) there is at… | |||||||||||||
Done Inline Actions
But I do not see how could we assert this in MI code without adding a new pmap method. And I am not sure that adding such method right now is worth it. kib: 3. the mapping is at the address for this fault
But I do not see how could we assert this in… | |||||||||||||
| vm_object_pip_wakeup(fs->object); | vm_object_pip_wakeup(fs->object); | ||||||||||||
| /* | /* | ||||||||||||
| * Only use the new page below... | * Only use the new page below... | ||||||||||||
| */ | */ | ||||||||||||
| fs->object = fs->first_object; | fs->object = fs->first_object; | ||||||||||||
| fs->pindex = fs->first_pindex; | fs->pindex = fs->first_pindex; | ||||||||||||
| fs->m = fs->first_m; | fs->m = fs->first_m; | ||||||||||||
| ▲ Show 20 Lines • Show All 315 Lines • ▼ Show 20 Lines | |||||||||||||
| * | * | ||||||||||||
| * We can theoretically allow the busy case on a read fault if the page | * We can theoretically allow the busy case on a read fault if the page | ||||||||||||
| * is marked valid, but since such pages are typically already pmap'd, | * is marked valid, but since such pages are typically already pmap'd, | ||||||||||||
| * putting that special case in might be more effort than it is worth. | * putting that special case in might be more effort than it is worth. | ||||||||||||
| * We cannot under any circumstances mess around with a shared busied | * We cannot under any circumstances mess around with a shared busied | ||||||||||||
| * page except, perhaps, to pmap it. | * page except, perhaps, to pmap it. | ||||||||||||
| */ | */ | ||||||||||||
| static void | static void | ||||||||||||
| vm_fault_busy_sleep(struct faultstate *fs) | vm_fault_busy_sleep(struct faultstate *fs, int allocflags) | ||||||||||||
| { | { | ||||||||||||
| /* | /* | ||||||||||||
| * Reference the page before unlocking and | * Reference the page before unlocking and | ||||||||||||
| * sleeping so that the page daemon is less | * sleeping so that the page daemon is less | ||||||||||||
| * likely to reclaim it. | * likely to reclaim it. | ||||||||||||
| */ | */ | ||||||||||||
| vm_page_aflag_set(fs->m, PGA_REFERENCED); | vm_page_aflag_set(fs->m, PGA_REFERENCED); | ||||||||||||
| if (fs->object != fs->first_object) { | if (fs->object != fs->first_object) { | ||||||||||||
| vm_fault_page_release(&fs->first_m); | vm_fault_page_release(&fs->first_m); | ||||||||||||
| vm_object_pip_wakeup(fs->first_object); | vm_object_pip_wakeup(fs->first_object); | ||||||||||||
| } | } | ||||||||||||
| vm_object_pip_wakeup(fs->object); | vm_object_pip_wakeup(fs->object); | ||||||||||||
| vm_fault_unlock_map(fs); | vm_fault_unlock_map(fs); | ||||||||||||
| if (!vm_page_busy_sleep(fs->m, "vmpfw", 0)) | if (!vm_page_busy_sleep(fs->m, "vmpfw", allocflags)) | ||||||||||||
| VM_OBJECT_UNLOCK(fs->object); | VM_OBJECT_UNLOCK(fs->object); | ||||||||||||
| VM_CNT_INC(v_intrans); | VM_CNT_INC(v_intrans); | ||||||||||||
| vm_object_deallocate(fs->first_object); | vm_object_deallocate(fs->first_object); | ||||||||||||
| } | } | ||||||||||||
| /* | /* | ||||||||||||
| * Handle page lookup, populate, allocate, page-in for the current | * Handle page lookup, populate, allocate, page-in for the current | ||||||||||||
| * object. | * object. | ||||||||||||
| Show All 29 Lines | vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp) | ||||||||||||
| } | } | ||||||||||||
| /* | /* | ||||||||||||
| * See if the page is resident. | * See if the page is resident. | ||||||||||||
| */ | */ | ||||||||||||
| vm_page_iter_init(&pages, fs->object); | vm_page_iter_init(&pages, fs->object); | ||||||||||||
| fs->m = vm_radix_iter_lookup(&pages, fs->pindex); | fs->m = vm_radix_iter_lookup(&pages, fs->pindex); | ||||||||||||
| if (fs->m != NULL) { | if (fs->m != NULL) { | ||||||||||||
| /* | |||||||||||||
| * If the found page is valid, either will be shadowed | |||||||||||||
Done Inline Actions
markj: | |||||||||||||
| * or mapped for read, and would not be renamed, then | |||||||||||||
Done Inline Actions
markj: | |||||||||||||
Done Inline Actions
alc: | |||||||||||||
| * busy it in shared mode. This allows other faults | |||||||||||||
Done Inline Actions
Historically, we have written "COW", not "CoW". I only found one instance of the latter under vm/. alc: Historically, we have written "COW", not "CoW". I only found one instance of the latter under… | |||||||||||||
| * needing this page to proceed in parallel. | |||||||||||||
| * | |||||||||||||
Done Inline ActionsNote that pmap_enter() uses the VM mapping protections to decide whether to map a page RW, so even a read fault may install a RW mapping. In this case, creation of the first writable mapping for a page requires the page to be xbusied, since pmap_remove_write() requires only that the page be sbusy. So, I think this check isn't sufficient, you have to disallow sbusying when (fs->prot & VM_PROT_WRITE) != 0 && fs->object == fs->first_object. But then, maybe we can simplify the check and sbusy only when fs->object != fs->first_object? Soft faults in the top-level object are mostly handled by vm_fault_soft_fast(). markj: Note that pmap_enter() uses the VM mapping protections to decide whether to map a page RW, so… | |||||||||||||
Done Inline Actions
This is only for the text mapping. Data segment mappings are cow and would xbusy the vnode page, which is the core of the mjg' complain, I believe. Also vm_fault_soft_fast() bail out if the page is busy. This probably should be changed to only fail if the page is xbusy, I added such change to the patch set. I added the check for fs.prot & VM_PROT_WRITE to correct the issue you pointed out. kib: > Soft faults in the top-level object are mostly handled by vm_fault_soft_fast().
This is only… | |||||||||||||
| * Unlocked check for validity, rechecked after busy | |||||||||||||
| * is obtained. | |||||||||||||
| */ | |||||||||||||
| if (vm_page_all_valid(fs->m) && | |||||||||||||
| /* | |||||||||||||
| * No write permissions for new fs->m mapping, | |||||||||||||
| * or the first object has only one mapping so | |||||||||||||
Done Inline Actions
alc: | |||||||||||||
| * other writeable CoW mappings of fs->m cannot | |||||||||||||
Done Inline Actions
alc: | |||||||||||||
| * appear under us. | |||||||||||||
Done Inline Actions
alc: | |||||||||||||
| */ | |||||||||||||
| (vm_fault_is_read(fs) || vm_fault_fo_is_onemapping(fs)) && | |||||||||||||
| /* fs->m cannot be renamed from object to first_object. */ | |||||||||||||
Done Inline Actions
markj: | |||||||||||||
| (!vm_fault_can_cow_rename(fs) || | |||||||||||||
| fs->object != fs->first_object->backing_object)) { | |||||||||||||
Done Inline ActionsI believe the check here is racy, since fs->first_object is not locked. And the parentheses look wrong. markj: I believe the check here is racy, since `fs->first_object` is not locked. And the parentheses… | |||||||||||||
| if (!vm_page_trysbusy(fs->m)) { | |||||||||||||
| vm_fault_busy_sleep(fs, VM_ALLOC_SBUSY); | |||||||||||||
Done Inline ActionsWhy disallow shared busy after the first failed attempt? markj: Why disallow shared busy after the first failed attempt? | |||||||||||||
Done Inline ActionsI want(ed) to have the code path to fall back to pristine old way if sbusy failed. I think we can remove can_sbusy indeed. One note that vm_fault_busy_sleep() should be more flexible, I will add allocflags argument. kib: I want(ed) to have the code path to fall back to pristine old way if sbusy failed.
I think we… | |||||||||||||
| return (FAULT_RESTART); | |||||||||||||
| } | |||||||||||||
| /* | |||||||||||||
| * Now make sure that conditions checked by | |||||||||||||
| * race are still valid, in particular, that | |||||||||||||
Done Inline ActionsMaybe, "Now make sure that racily checked conditions are still valid" markj: Maybe, "Now make sure that racily checked conditions are still valid" | |||||||||||||
| * CoW OBJ_ONEMAPPING check is not occurring. | |||||||||||||
Done Inline ActionsI think this comment about ONEMAPPING is stale? markj: I think this comment about ONEMAPPING is stale? | |||||||||||||
| * Keep fs->object locked until vm_fault_cow() | |||||||||||||
Done Inline Actions
Since the condition was checked once already. markj: Since the condition was checked once already. | |||||||||||||
| * is done. | |||||||||||||
| */ | |||||||||||||
Done Inline ActionsSo in this case we will call vm_fault_busy_sleep(), but why? Why not unbusy the page and fall through to the vm_page_tryxbusy() below? markj: So in this case we will call vm_fault_busy_sleep(), but why? Why not unbusy the page and fall… | |||||||||||||
| if (__predict_true(vm_page_all_valid(fs->m)) && | |||||||||||||
Done Inline ActionsShouldn't the predict_true apply to the whole predicate? markj: Shouldn't the predict_true apply to the whole predicate? | |||||||||||||
| (vm_fault_is_read(fs) || | |||||||||||||
Done Inline Actions
markj: | |||||||||||||
| vm_fault_fo_is_onemapping(fs))) { | |||||||||||||
| return (FAULT_SOFT); | |||||||||||||
| } | |||||||||||||
| vm_page_sunbusy(fs->m); | |||||||||||||
| } | |||||||||||||
Done Inline ActionsI wonder if it's worth maintaining a counter here? markj: I wonder if it's worth maintaining a counter here? | |||||||||||||
Done Inline ActionsWhich counter? The count of times we tried to use sbusy but after relock found that conditions are not true? I do not see it much interesting. I had a debug part of this patch that counted successful sbusy cases, and then even more fine-grained list of conditions for each reason that sbusy cannot be used. I do not think it is very useful for either production or in-field debugging. kib: Which counter? The count of times we tried to use sbusy but after relock found that conditions… | |||||||||||||
Done Inline ActionsYes. It should be cheap to maintain and maybe it will reveal some unexpected phenomenon. But ok, it's probably not useful. markj: Yes. It should be cheap to maintain and maybe it will reveal some unexpected phenomenon. But ok… | |||||||||||||
| if (!vm_page_tryxbusy(fs->m)) { | if (!vm_page_tryxbusy(fs->m)) { | ||||||||||||
| vm_fault_busy_sleep(fs); | vm_fault_busy_sleep(fs, 0); | ||||||||||||
| return (FAULT_RESTART); | return (FAULT_RESTART); | ||||||||||||
| } | } | ||||||||||||
| /* | /* | ||||||||||||
| * The page is marked busy for other processes and the | * The page is marked busy for other processes and the | ||||||||||||
| * pagedaemon. If it is still completely valid we are | * pagedaemon. If it is still completely valid we are | ||||||||||||
| * done. | * done. | ||||||||||||
| */ | */ | ||||||||||||
| ▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines | vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp) | ||||||||||||
| } else { | } else { | ||||||||||||
| res = FAULT_CONTINUE; | res = FAULT_CONTINUE; | ||||||||||||
| } | } | ||||||||||||
| return (res); | return (res); | ||||||||||||
| } | } | ||||||||||||
| int | int | ||||||||||||
| vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, | vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, | ||||||||||||
| int fault_flags, vm_page_t *m_hold) | int fault_flags, vm_page_t *m_hold) | ||||||||||||
Done Inline ActionsI prefer just rangelocked. markj: I prefer just `rangelocked`. | |||||||||||||
| { | { | ||||||||||||
| struct pctrie_iter pages; | struct pctrie_iter pages; | ||||||||||||
| struct faultstate fs; | struct faultstate fs; | ||||||||||||
| int ahead, behind, faultcount, rv; | int ahead, behind, faultcount, rv; | ||||||||||||
| enum fault_status res; | enum fault_status res; | ||||||||||||
| enum fault_next_status res_next; | enum fault_next_status res_next; | ||||||||||||
| bool hardfault; | bool hardfault, object_locked; | ||||||||||||
| VM_CNT_INC(v_vm_faults); | VM_CNT_INC(v_vm_faults); | ||||||||||||
| if ((curthread->td_pflags & TDP_NOFAULTING) != 0) | if ((curthread->td_pflags & TDP_NOFAULTING) != 0) | ||||||||||||
| return (KERN_PROTECTION_FAILURE); | return (KERN_PROTECTION_FAILURE); | ||||||||||||
| fs.vp = NULL; | fs.vp = NULL; | ||||||||||||
| fs.vaddr = vaddr; | fs.vaddr = vaddr; | ||||||||||||
| ▲ Show 20 Lines • Show All 128 Lines • ▼ Show 20 Lines | while (TRUE) { | ||||||||||||
| VM_OBJECT_UNLOCK(fs.object); | VM_OBJECT_UNLOCK(fs.object); | ||||||||||||
| vm_fault_zerofill(&fs); | vm_fault_zerofill(&fs); | ||||||||||||
| /* Don't try to prefault neighboring pages. */ | /* Don't try to prefault neighboring pages. */ | ||||||||||||
| faultcount = 1; | faultcount = 1; | ||||||||||||
| break; | break; | ||||||||||||
| } | } | ||||||||||||
| found: | found: | ||||||||||||
| /* | /* | ||||||||||||
| * A valid page has been found and exclusively busied. The | * A valid page has been found and busied. The object lock | ||||||||||||
Done Inline ActionsThis comment is inaccurate now. markj: This comment is inaccurate now. | |||||||||||||
| * object lock must no longer be held. | * must no longer be held if the page was xbusied, but is kept | ||||||||||||
Done Inline ActionsThe comment about the object lock is false in the sbusy case. markj: The comment about the object lock is false in the sbusy case. | |||||||||||||
| * around until CoW is done for the sbusied case. | |||||||||||||
| * | |||||||||||||
| * Regardless of the busy state of fs.m, fs.first_m is always | |||||||||||||
| * exclusively busied after the first iteration of the loop | |||||||||||||
| * calling vm_fault_object(). This is an ordering point for | |||||||||||||
Not Done Inline Actions
While it would be highly unusual, the top-level object could be read-write-shared between address spaces, so I don't see the reason for including "in our vm_map" here. alc: While it would be highly unusual, the top-level object could be read-write-shared between… | |||||||||||||
| * the parallel faults occurring in our vm_map on the same | |||||||||||||
| * page. | |||||||||||||
Done Inline ActionsThis means that different processes faulting on the same top-level object will be serialized when faulting on the same page. So, if the top-level object is shared, i.e., OBJ_ONEMAPPING is clear, do we really need the extra synchronization from the fs.first_object read lock? It looks like we can just rely on the existing "racy" check in vm_fault_cow() after all. We just change the justification a little bit: instead of relying on the xbusy of m_cow to provide serialization, we rely on the xbusy of fs.first_m to ensure that a second process will not create a read-only pmap entry for m_cow. markj: This means that different processes faulting on the same top-level object will be serialized… | |||||||||||||
markjUnsubmitted Done Inline ActionsThe xbusy of first_m also serializes page faults through other mappings of the same top-level object. So, in the end maybe we do not need the object lock to synchronize the OBJ_ONEMAPPING check:
In other words, I think the argument in the comment "The flag check is racy, but this is tolerable..." can be changed to apply to the sbusy case too. Instead of using the xbusy state of m_cow to make the argument, we can use the xbusy state of first_m to argue that the racy check is safe. Then we do not need any special handling for the sbusy case. Am I missing something? markj: The xbusy of `first_m` also serializes page faults through other mappings of the same top-level… | |||||||||||||
kibAuthorUnsubmitted Done Inline ActionsBy the special handling for the sbusy case, do you mean my extent of the fs->object lock after the vm_fault_cow() is done? I believe yes, because you said in the second sentence that 'we do not need the object lock'. The fs->object locking purpose was not to handle the OBJ_ONEMAPPING complication. There are checks in vm_fault_object() that should prevent vm_fault_cow() from copying the page from fs->object to fs->first_object, done by vm_fault_can_cow_rename(). I need a way to ensure that the invariants are kept true between the check in vm_fault_object() and the action in vm_fault_cow(). I believe that object collapse might break this. Even if it cannot now, still the checked conditions (refcount != 1 or shadow_count != 1) are delicate enough that collapse implementation should be allowed enough freedom so we do not need to rely on its internals to keep that true. kib: By the special handling for the sbusy case, do you mean my extent of the fs->object lock after… | |||||||||||||
markjUnsubmitted Done Inline Actions
Right.
Ok, but really my point was that there is no OBJ_ONEMAPPING complication, or so I believe. That is, I think the sbusy case does not need to check (fs.first_object->flags & OBJ_ONEMAPPING) != 0. Or, I do not understand the justification for checking it.
Why not perform the checks racily, and disallow renaming if !vm_page_xbusied(fs->m)?
Even today, the ref_count == 1 and shadow_count == 1 checks are done without the object lock held. I can't remember offhand why that is safe. markj: > By the special handling for the sbusy case, do you mean my extent of the fs->object lock… | |||||||||||||
kibAuthorUnsubmitted Done Inline Actions
Ok, I reduced the vm_fault_fo_is_onemapping() to vm_fault_might_be_cow(). Despite trivial, it is useful IMO, all places where I replaced direct object != first_object checks become easier to understand.
Ok.
I added the re-check after the locks are acquired. kib: > > The fs->object locking purpose was not to handle the OBJ_ONEMAPPING complication.
>
> Ok… | |||||||||||||
| */ | */ | ||||||||||||
| vm_page_assert_xbusied(fs.m); | if (vm_page_xbusied(fs.m)) { | ||||||||||||
| object_locked = false; | |||||||||||||
| VM_OBJECT_ASSERT_UNLOCKED(fs.object); | VM_OBJECT_ASSERT_UNLOCKED(fs.object); | ||||||||||||
| } else { | |||||||||||||
| vm_page_assert_busied(fs.m); | |||||||||||||
| object_locked = true; | |||||||||||||
Done Inline Actionsobject_locked would be a clearer name for this flag IMO. markj: `object_locked` would be a clearer name for this flag IMO. | |||||||||||||
| VM_OBJECT_ASSERT_LOCKED(fs.object); | |||||||||||||
| } | |||||||||||||
| /* | /* | ||||||||||||
| * If the page is being written, but isn't already owned by the | * If the page is being written, but isn't already owned by the | ||||||||||||
| * top-level object, we have to copy it into a new page owned by the | * top-level object, we have to copy it into a new page owned by the | ||||||||||||
| * top-level object. | * top-level object. | ||||||||||||
| */ | */ | ||||||||||||
| if (fs.object != fs.first_object) { | if (fs.object != fs.first_object) { | ||||||||||||
| /* | /* | ||||||||||||
Done Inline ActionsExtra newline markj: Extra newline | |||||||||||||
| * We only really need to copy if we want to write it. | * We only really need to copy if we want to write it. | ||||||||||||
| */ | */ | ||||||||||||
| if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { | if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { | ||||||||||||
| vm_fault_cow(&fs); | vm_fault_cow(&fs, res); | ||||||||||||
| object_locked = false; | |||||||||||||
| /* | /* | ||||||||||||
| * We only try to prefault read-only mappings to the | * We only try to prefault read-only mappings to the | ||||||||||||
| * neighboring pages when this copy-on-write fault is | * neighboring pages when this copy-on-write fault is | ||||||||||||
| * a hard fault. In other cases, trying to prefault | * a hard fault. In other cases, trying to prefault | ||||||||||||
| * is typically wasted effort. | * is typically wasted effort. | ||||||||||||
| */ | */ | ||||||||||||
| if (faultcount == 0) | if (faultcount == 0) | ||||||||||||
| faultcount = 1; | faultcount = 1; | ||||||||||||
| } else { | } else { | ||||||||||||
| fs.prot &= ~VM_PROT_WRITE; | fs.prot &= ~VM_PROT_WRITE; | ||||||||||||
| } | } | ||||||||||||
| } | } | ||||||||||||
| if (object_locked) | |||||||||||||
| VM_OBJECT_UNLOCK(fs.object); | |||||||||||||
| /* | /* | ||||||||||||
| * We must verify that the maps have not changed since our last | * We must verify that the maps have not changed since our last | ||||||||||||
| * lookup. | * lookup. | ||||||||||||
Done Inline ActionsSuppose two threads fault on the same virtual page, one is a read fault, one is a write fault. Suppose the page is COW and the source page is valid. After this change, what stops them from racing? In particular, how do we ensure that the new top-level page is entered into the pmap? markj: Suppose two threads fault on the same virtual page, one is a read fault, one is a write fault. | |||||||||||||
Done Inline ActionsWell, the read fault is impossible, because the fault type is defined by the mapping permissions, not just the hardware faulting mode. So if it is the same virtual page, then both faults would proceed this as a write fault. kib: Well, the read fault is impossible, because the fault type is defined by the mapping… | |||||||||||||
Done Inline ActionsSorry, I don't understand. Suppose the MAP_ENTRY_NEEDS_COPY is clear and there is already a shadow object O with backing object O'. Suppose there is some resident page m in O' that is not mapped into the process. Suppose there is no resident page at that address in O. Let the write-faulting thread be T1, and the read-faulting thread be T2. Then: T2:
T1:
T2:
Now, subsequent reads of the page will not show the modification done by T1. markj: Sorry, I don't understand. Suppose the MAP_ENTRY_NEEDS_COPY is clear and there is already a… | |||||||||||||
Done Inline Actionskib pointed out on IRC that these threads are serialized by the xbusying the top-level page. I forgot that we hold the xbusy lock for the entire duration of the fault, even for read faults. markj: kib pointed out on IRC that these threads are serialized by the xbusying the top-level page. I… | |||||||||||||
| */ | */ | ||||||||||||
| if (!fs.lookup_still_valid) { | if (!fs.lookup_still_valid) { | ||||||||||||
| rv = vm_fault_relookup(&fs); | rv = vm_fault_relookup(&fs); | ||||||||||||
| if (rv != KERN_SUCCESS) { | if (rv != KERN_SUCCESS) { | ||||||||||||
| vm_fault_deallocate(&fs); | vm_fault_deallocate(&fs); | ||||||||||||
| if (rv == KERN_RESTART) | if (rv == KERN_RESTART) | ||||||||||||
| goto RetryFault; | goto RetryFault; | ||||||||||||
| return (rv); | return (rv); | ||||||||||||
| Show All 22 Lines | KASSERT(vm_page_none_valid(fs.m), | ||||||||||||
| ("vm_fault: page %p is already valid", fs.m_cow)); | ("vm_fault: page %p is already valid", fs.m_cow)); | ||||||||||||
| vm_page_valid(fs.m); | vm_page_valid(fs.m); | ||||||||||||
| } | } | ||||||||||||
| /* | /* | ||||||||||||
| * Page must be completely valid or it is not fit to | * Page must be completely valid or it is not fit to | ||||||||||||
| * map into user space. vm_pager_get_pages() ensures this. | * map into user space. vm_pager_get_pages() ensures this. | ||||||||||||
| */ | */ | ||||||||||||
| vm_page_assert_xbusied(fs.m); | vm_page_assert_busied(fs.m); | ||||||||||||
| KASSERT(vm_page_all_valid(fs.m), | KASSERT(vm_page_all_valid(fs.m), | ||||||||||||
| ("vm_fault: page %p partially invalid", fs.m)); | ("vm_fault: page %p partially invalid", fs.m)); | ||||||||||||
| vm_fault_dirty(&fs, fs.m); | vm_fault_dirty(&fs, fs.m); | ||||||||||||
| /* | /* | ||||||||||||
| * Put this page into the physical map. We had to do the unlock above | * Put this page into the physical map. We had to do the unlock above | ||||||||||||
| * because pmap_enter() may sleep. We don't put the page | * because pmap_enter() may sleep. We don't put the page | ||||||||||||
| Show All 15 Lines | found: | ||||||||||||
| if ((fs.fault_flags & VM_FAULT_WIRE) != 0) | if ((fs.fault_flags & VM_FAULT_WIRE) != 0) | ||||||||||||
| vm_page_wire(fs.m); | vm_page_wire(fs.m); | ||||||||||||
| else | else | ||||||||||||
| vm_page_activate(fs.m); | vm_page_activate(fs.m); | ||||||||||||
| if (fs.m_hold != NULL) { | if (fs.m_hold != NULL) { | ||||||||||||
| (*fs.m_hold) = fs.m; | (*fs.m_hold) = fs.m; | ||||||||||||
| vm_page_wire(fs.m); | vm_page_wire(fs.m); | ||||||||||||
| } | } | ||||||||||||
| KASSERT(fs.first_object == fs.object || vm_page_xbusied(fs.first_m), | |||||||||||||
| ("first_m must be xbusy")); | |||||||||||||
| if (vm_page_xbusied(fs.m)) | |||||||||||||
Done Inline ActionsCan we also assert something like, fs.first_m == fs.m || vm_page_xbusied(fs.first_m) here? markj: Can we also assert something like, `fs.first_m == fs.m || vm_page_xbusied(fs.first_m)` here? | |||||||||||||
| vm_page_xunbusy(fs.m); | vm_page_xunbusy(fs.m); | ||||||||||||
| else | |||||||||||||
| vm_page_sunbusy(fs.m); | |||||||||||||
| fs.m = NULL; | fs.m = NULL; | ||||||||||||
| /* | /* | ||||||||||||
| * Unlock everything, and return | * Unlock everything, and return | ||||||||||||
| */ | */ | ||||||||||||
| vm_fault_deallocate(&fs); | vm_fault_deallocate(&fs); | ||||||||||||
| if (hardfault) { | if (hardfault) { | ||||||||||||
| VM_CNT_INC(v_io_faults); | VM_CNT_INC(v_io_faults); | ||||||||||||
| Show All 26 Lines | |||||||||||||
| * the faulting pindex by the cluster size when the pages read by vm_fault() | * the faulting pindex by the cluster size when the pages read by vm_fault() | ||||||||||||
| * cross a cluster-size boundary. The cluster size is the greater of the | * cross a cluster-size boundary. The cluster size is the greater of the | ||||||||||||
| * smallest superpage size and VM_FAULT_DONTNEED_MIN. | * smallest superpage size and VM_FAULT_DONTNEED_MIN. | ||||||||||||
| * | * | ||||||||||||
| * When "fs->first_object" is a shadow object, the pages in the backing object | * When "fs->first_object" is a shadow object, the pages in the backing object | ||||||||||||
| * that precede the faulting pindex are deactivated by vm_fault(). So, this | * that precede the faulting pindex are deactivated by vm_fault(). So, this | ||||||||||||
| * function must only be concerned with pages in the first object. | * function must only be concerned with pages in the first object. | ||||||||||||
| */ | */ | ||||||||||||
| static void | static void | ||||||||||||
Done Inline ActionsWhat exactly is the rangelock synchronizing? markj: What exactly is the rangelock synchronizing? | |||||||||||||
Done Inline ActionsTwo threads faulting on the same address on the same address space. We might end up double-counting wire count e.g. Before the patch, xbusy serialized them. kib: Two threads faulting on the same address on the same address space. We might end up double… | |||||||||||||
Done Inline ActionsThis requires a comment IMO. I understand that the lock prevents two threads from faulting on the same page, but it's not at all clear why this is needed. I'm also not sure what double-counting you're referring to. Double wiring from VM_FAULT_WIRE? markj: This requires a comment IMO. I understand that the lock prevents two threads from faulting on… | |||||||||||||
Done Inline ActionsVM_FAULT_WIRE must occur only once from vm_fault() call from vm_map_wire(). And from my re-reading of the code, vm_fault() is careful to not wire the page when spurious calls to vm_fault() occur on the wired entry. I also failed to see other reasons for disallowing faults on the same address. At worst we would do pmap_enter()s in parallel which should be handled by pmap. So I will drop the rangelock, after all. kib: VM_FAULT_WIRE must occur only once from vm_fault() call from vm_map… | |||||||||||||
| vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) | vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) | ||||||||||||
| { | { | ||||||||||||
| struct pctrie_iter pages; | struct pctrie_iter pages; | ||||||||||||
| vm_map_entry_t entry; | vm_map_entry_t entry; | ||||||||||||
| vm_object_t first_object; | vm_object_t first_object; | ||||||||||||
| vm_offset_t end, start; | vm_offset_t end, start; | ||||||||||||
| vm_page_t m; | vm_page_t m; | ||||||||||||
| vm_size_t size; | vm_size_t size; | ||||||||||||
| ▲ Show 20 Lines • Show All 465 Lines • Show Last 20 Lines | |||||||||||||
Rather than adding a new result and passing that around, can we instead add a new page_sbusied flag to the fault state? Or, can we just use vm_page_xbusied(fs.m) to check for this case instead of adding new state?