Index: sys/arm64/arm64/pmap.c =================================================================== --- sys/arm64/arm64/pmap.c +++ sys/arm64/arm64/pmap.c @@ -217,6 +217,14 @@ #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) +/* + * The presence of this flag indicates that the mapping is writeable. + * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is + * dirty. This bit should be preemptively set on unmanaged mappings to avoid + * unnecessary faults. + */ +static pt_entry_t ATTR_SW_DBM; + struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ @@ -315,11 +323,10 @@ * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ -#define pmap_clear(table) atomic_store_64(table, 0) -#define pmap_load_store(table, entry) atomic_swap_64(table, entry) -#define pmap_set(table, mask) atomic_set_64(table, mask) -#define pmap_load_clear(table) atomic_swap_64(table, 0) -#define pmap_load(table) (*table) +#define pmap_clear(table) atomic_store_64(table, 0) +#define pmap_load_store(table, entry) atomic_swap_64(table, entry) +#define pmap_load_clear(table) atomic_swap_64(table, 0) +#define pmap_load(table) (*table) /********************/ /* Inline functions */ @@ -531,16 +538,15 @@ CTASSERT(L1_BLOCK == L2_BLOCK); -/* - * Checks if the page is dirty. We currently lack proper tracking of this on - * arm64 so for now assume is a page mapped as rw was accessed it is. 
- */ static inline int pmap_pte_dirty(pt_entry_t pte) { - return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) == - (ATTR_AF | ATTR_AP(ATTR_AP_RW))); + KASSERT((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != 0, + ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); + + return ((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == + (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM)); } static __inline void @@ -645,7 +651,8 @@ l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_load_store(&l2[l2_slot], - (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | + (pa & ~L2_OFFSET) | ATTR_DEFAULT | + ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), @@ -657,7 +664,8 @@ pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_load_store(&pagetable_dmap[l1_slot], - (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN | + (pa & ~L1_OFFSET) | ATTR_DEFAULT | + ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } @@ -682,7 +690,8 @@ pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_load_store(&l2[l2_slot], - (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | + (pa & ~L2_OFFSET) | ATTR_DEFAULT | + ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } } @@ -765,13 +774,18 @@ vm_size_t kernlen) { u_int l1_slot, l2_slot; - uint64_t kern_delta; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; + uint64_t kern_delta, reg; int i; + /* Determine whether the hardware implements DBM management. */ + reg = READ_SPECIALREG(ID_AA64MMFR1_EL1); + ATTR_SW_DBM = ID_AA64MMFR1_HAFDBS(reg) == ID_AA64MMFR1_HAFDBS_AF_DBS ? 
+ ATTR_DBM : _ATTR_SW_DBM; + kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); @@ -1168,7 +1182,8 @@ KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); - attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE; + attr = ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM | ATTR_IDX(mode) | + L3_PAGE; if (mode == DEVICE_MEMORY) attr |= ATTR_XN; @@ -1286,7 +1301,7 @@ m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | - ATTR_IDX(m->md.pv_memattr) | L3_PAGE; + ATTR_SW_DBM | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; if (m->md.pv_memattr == DEVICE_MEMORY) pa |= ATTR_XN; pte = pmap_l2_to_l3(pde, va); @@ -2478,7 +2493,7 @@ /* * pmap_remove_l3: do the things to unmap a page in a process */ -static int +static int __unused pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { @@ -2788,7 +2803,8 @@ * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void -pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t nbits) +pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, + pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; @@ -2804,7 +2820,8 @@ * Return if the L2 entry already has the desired access restrictions * in place. */ - if ((old_l2 | nbits) == old_l2) +retry: + if ((old_l2 & mask) == nbits) return; /* @@ -2812,15 +2829,15 @@ * update the dirty field of each of the superpage's constituent 4KB * pages. 
*/ - if ((nbits & ATTR_AP(ATTR_AP_RO)) != 0 && - (old_l2 & ATTR_SW_MANAGED) != 0 && - pmap_pte_dirty(old_l2)) { + if ((old_l2 & ATTR_SW_MANAGED) != 0 && + (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } - pmap_set(l2, nbits); + if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) + goto retry; /* * Since a promotion must break the 4KB page mappings before making @@ -2838,7 +2855,7 @@ { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; - pt_entry_t *l3p, l3, nbits; + pt_entry_t *l3p, l3, mask, nbits; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { @@ -2846,12 +2863,16 @@ return; } - nbits = 0; - if ((prot & VM_PROT_WRITE) == 0) + mask = nbits = 0; + if ((prot & VM_PROT_WRITE) == 0) { + mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_AP(ATTR_AP_RO); - if ((prot & VM_PROT_EXECUTE) == 0) + } + if ((prot & VM_PROT_EXECUTE) == 0) { + mask |= ATTR_XN; nbits |= ATTR_XN; - if (nbits == 0) + } + if (mask == 0) return; PMAP_LOCK(pmap); @@ -2883,7 +2904,7 @@ if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { - pmap_protect_l2(pmap, l2, sva, nbits); + pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; @@ -2897,6 +2918,8 @@ va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { + l3 = pmap_load(l3p); +retry: /* * Go to the next L3 entry if the current one is * invalid or already has the desired access @@ -2905,27 +2928,27 @@ * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) 
*/ - l3 = pmap_load(l3p); - if (!pmap_l3_valid(l3) || (l3 | nbits) == l3) { + if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } - if (va == va_next) - va = sva; - /* - * When a dirty read/write mapping is write protected, - * update the page's dirty field. - */ - if ((nbits & ATTR_AP(ATTR_AP_RO)) != 0 && - (l3 & ATTR_SW_MANAGED) != 0 && + /* + * When a dirty read/write mapping is write protected, + * update the page's dirty field. + */ + if ((l3 & ATTR_SW_MANAGED) != 0 && + (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); - pmap_set(l3p, nbits); + if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) + goto retry; + if (va == va_next) + va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); @@ -3145,7 +3168,11 @@ pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); - if ((prot & VM_PROT_WRITE) == 0) + if ((prot & VM_PROT_WRITE) != 0) + new_l3 |= ATTR_SW_DBM; + if ((flags & VM_PROT_WRITE) != 0) + new_l3 |= ATTR_AP(ATTR_AP_RW); + else new_l3 |= ATTR_AP(ATTR_AP_RO); if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l3 |= ATTR_XN; @@ -3342,6 +3369,7 @@ KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ + /* XXXMJ need to reload orig_l3 for hardware DBM. 
*/ pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if (pmap_pte_dirty(orig_l3) && @@ -3695,7 +3723,7 @@ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); - l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | + l3_val = pa | (ATTR_DEFAULT & ~ATTR_AF) | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) l3_val |= ATTR_XN; @@ -3842,7 +3870,7 @@ struct rwlock *lock; struct spglist free; pd_entry_t *l0, *l1, *l2, srcptepaddr; - pt_entry_t *dst_pte, ptetemp, *src_pte; + pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_l2pg, dstmpte, srcmpte; @@ -3893,8 +3921,10 @@ ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { - (void)pmap_load_store(l2, srcptepaddr & - ~ATTR_SW_WIRED); + mask = ATTR_AF | ATTR_SW_WIRED; + nbits = (srcptepaddr & ATTR_SW_DBM) != 0 ? + ATTR_AP(ATTR_AP_RO) : 0; + (void)pmap_load_store(l2, (srcptepaddr & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); @@ -3938,11 +3968,11 @@ /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. - * - * XXX not yet */ - (void)pmap_load_store(dst_pte, ptetemp & - ~ATTR_SW_WIRED); + mask = ATTR_AF | ATTR_SW_WIRED; + nbits = (ptetemp & ATTR_SW_DBM) != 0 ? + ATTR_AP(ATTR_AP_RO) : 0; + (void)pmap_load_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); @@ -4293,8 +4323,7 @@ /* * Update the vm_page_t clean/reference bits. 
*/ - if ((tpte & ATTR_AP_RW_BIT) == - ATTR_AP(ATTR_AP_RW)) { + if (pmap_pte_dirty(tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) @@ -4569,7 +4598,7 @@ } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); - if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) + if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", @@ -4592,13 +4621,14 @@ } } pte = pmap_pte(pmap, pv->pv_va, &lvl); -retry: oldpte = pmap_load(pte); - if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) { - if (!atomic_cmpset_long(pte, oldpte, - oldpte | ATTR_AP(ATTR_AP_RO))) +retry: + if ((oldpte & ATTR_SW_DBM) != 0) { + if (!atomic_fcmpset_long(pte, &oldpte, + (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM)) goto retry; - if ((oldpte & ATTR_AF) != 0) + if ((oldpte & ATTR_AP_RW_BIT) == + ATTR_AP(ATTR_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } @@ -4608,13 +4638,6 @@ vm_page_aflag_clear(m, PGA_WRITEABLE); } -static __inline boolean_t -safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) -{ - - return (FALSE); -} - /* * pmap_ts_referenced: * @@ -4640,12 +4663,10 @@ struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; - pt_entry_t *l3; vm_offset_t va; vm_paddr_t pa; - int cleared, md_gen, not_cleared, lvl, pvh_gen; + int cleared, md_gen, lvl, pvh_gen; struct spglist free; - bool demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); @@ -4656,7 +4677,6 @@ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: - not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; @@ -4692,63 +4712,29 @@ */ vm_page_dirty(m); } - if ((tpte & ATTR_AF) != 0) { - /* - * Since this reference bit is shared by 512 4KB - * pages, it should not be cleared every time it is - * tested. 
Apply a simple "hash" function on the - * physical page number, the virtual superpage number, - * and the pmap address to select one 4KB page out of - * the 512 on which testing the reference bit will - * result in clearing that reference bit. This - * function is designed to avoid the selection of the - * same 4KB page for every 2MB page mapping. - * - * On demotion, a mapping that hasn't been referenced - * is simply destroyed. To avoid the possibility of a - * subsequent page fault on a demoted wired mapping, - * always leave its reference bit set. Moreover, - * since the superpage is wired, the current state of - * its reference bit won't affect page replacement. - */ - if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ - (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && - (tpte & ATTR_SW_WIRED) == 0) { - if (safe_to_clear_referenced(pmap, tpte)) { - /* - * TODO: We don't handle the access - * flag at all. We need to be able - * to set it in the exception handler. - */ - panic("ARM64TODO: " - "safe_to_clear_referenced\n"); - } else if (pmap_demote_l2_locked(pmap, pte, - pv->pv_va, &lock) != NULL) { - demoted = true; - va += VM_PAGE_TO_PHYS(m) - - (tpte & ~ATTR_MASK); - l3 = pmap_l2_to_l3(pte, va); - pmap_remove_l3(pmap, l3, va, - pmap_load(pte), NULL, &lock); - } else - demoted = true; - - if (demoted) { - /* - * The superpage mapping was removed - * entirely and therefore 'pv' is no - * longer valid. - */ - if (pvf == pv) - pvf = NULL; - pv = NULL; - } - cleared++; - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); - } else - not_cleared++; + + /* + * Since this reference bit is shared by 512 4KB pages, it + * should not be cleared every time it is tested. 
Apply a + * simple "hash" function on the physical page number, the + * virtual superpage number, and the pmap address to select one + * 4KB page out of the 512 on which testing the reference bit + * will result in clearing that reference bit. This function is + * designed to avoid the selection of the same 4KB page for + * every 2MB page mapping. + * + * On demotion, a mapping that hasn't been referenced is simply + * destroyed. To avoid the possibility of a subsequent page + * fault on a demoted wired mapping, always leave its reference + * bit set. Moreover, since the superpage is wired, the current + * state of its reference bit won't affect page replacement. + */ + if ((tpte & ATTR_AF) != 0 && (((pa >> PAGE_SHIFT) ^ + (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & + (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { + atomic_clear_64(pte, ATTR_AF); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ @@ -4757,7 +4743,7 @@ TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } - if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) + if (cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: @@ -4791,31 +4777,9 @@ if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { - if (safe_to_clear_referenced(pmap, tpte)) { - /* - * TODO: We don't handle the access flag - * at all. We need to be able to set it in - * the exception handler. - */ - panic("ARM64TODO: safe_to_clear_referenced\n"); - } else if ((tpte & ATTR_SW_WIRED) == 0) { - /* - * Wired pages cannot be paged out so - * doing accessed bit emulation for - * them is wasted effort. We do the - * hard work for unwired pages only. 
- */ - pmap_remove_l3(pmap, pte, pv->pv_va, tpde, - &free, &lock); - cleared++; - if (pvf == pv) - pvf = NULL; - pv = NULL; - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); - } else - not_cleared++; + atomic_clear_64(pte, ATTR_AF); + pmap_invalidate_page(pmap, pv->pv_va); + cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ @@ -4824,12 +4788,12 @@ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } - } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + - not_cleared < PMAP_TS_REFERENCED_MAX); + } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared < + PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); - return (cleared + not_cleared); + return (cleared); } /* @@ -4848,6 +4812,14 @@ void pmap_clear_modify(vm_page_t m) { + struct md_page *pvh; + struct rwlock *lock; + pmap_t pmap; + pv_entry_t next_pv, pv; + pd_entry_t *l2, oldl2; + pt_entry_t *l3, oldl3; + vm_offset_t va; + int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); @@ -4856,14 +4828,81 @@ ("pmap_clear_modify: page %p is exclusive busied", m)); /* - * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. - * If the object containing the page is locked and the page is not + * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM + * set. If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; - - /* ARM64TODO: We lack support for tracking if a page is modified */ + pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : + pa_to_pvh(VM_PAGE_TO_PHYS(m)); + lock = VM_PAGE_TO_PV_LIST_LOCK(m); + rw_wlock(lock); +restart: + TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + va = pv->pv_va; + l2 = pmap_l2(pmap, va); + oldl2 = pmap_load(l2); + if ((oldl2 & ATTR_SW_DBM) != 0) { + if (pmap_demote_l2_locked(pmap, l2, va, &lock)) { + if ((oldl2 & ATTR_SW_WIRED) == 0) { + /* + * Write protect the mapping to a + * single page so that a subsequent + * write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - + (oldl2 & ~ATTR_MASK); + l3 = pmap_l2_to_l3(l2, va); + oldl3 = pmap_load(l3); + if (pmap_l3_valid(oldl3)) { + while (!atomic_fcmpset_long(l3, + &oldl3, (oldl3 & ~ATTR_SW_DBM) | + ATTR_AP(ATTR_AP_RO))) + cpu_spinwait(); + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); + } + } + } + } + PMAP_UNLOCK(pmap); + } + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + if (!PMAP_TRYLOCK(pmap)) { + md_gen = m->md.pv_gen; + pvh_gen = pvh->pv_gen; + rw_wunlock(lock); + PMAP_LOCK(pmap); + rw_wlock(lock); + if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { + PMAP_UNLOCK(pmap); + goto restart; + } + } + l2 = pmap_l2(pmap, pv->pv_va); + l3 = pmap_l2_to_l3(l2, pv->pv_va); + oldl3 = pmap_load(l3); + if (pmap_l3_valid(oldl3) && + (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { + atomic_set_64(l3, ATTR_AP(ATTR_AP_RO)); + pmap_invalidate_page(pmap, pv->pv_va); + } + PMAP_UNLOCK(pmap); + } + rw_wunlock(lock); } void * @@ -4942,8 +4981,8 @@ /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, - pa | ATTR_DEFAULT | ATTR_XN | - ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); + pa | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM | + ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; @@ -5567,22 +5606,54 @@ int 
pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { -#ifdef SMP + pt_entry_t *pte; register_t intr; - uint64_t par; + uint64_t ec, par; + int lvl, rv; - switch (ESR_ELx_EXCEPTION(esr)) { + rv = KERN_FAILURE; + + ec = ESR_ELx_EXCEPTION(esr); + switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: - return (KERN_FAILURE); + return (rv); } - /* Data and insn aborts use same encoding for FCS field. */ + /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { + case ISS_DATA_DFSC_AFF_L1: + case ISS_DATA_DFSC_AFF_L2: + case ISS_DATA_DFSC_AFF_L3: + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, far, &lvl); + if (pte != NULL && (pmap_load(pte) & ATTR_AF) == 0) { + atomic_set_64(pte, ATTR_AF); + rv = KERN_SUCCESS; + } + PMAP_UNLOCK(pmap); + break; + case ISS_DATA_DFSC_PF_L1: + case ISS_DATA_DFSC_PF_L2: + case ISS_DATA_DFSC_PF_L3: + if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || + (esr & ISS_DATA_WnR) == 0) + return (rv); + PMAP_LOCK(pmap); + pte = pmap_pte(pmap, far, &lvl); + if (pte != NULL && + (pmap_load(pte) & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == + (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { + atomic_clear_64(pte, ATTR_AP_RW_BIT); + pmap_invalidate_page(pmap, trunc_page(far)); + rv = KERN_SUCCESS; + } + PMAP_UNLOCK(pmap); + break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: @@ -5603,14 +5674,11 @@ * return success to the trap handler. */ if (PAR_SUCCESS(par)) - return (KERN_SUCCESS); - break; - default: + rv = KERN_SUCCESS; break; } -#endif - return (KERN_FAILURE); + return (rv); } /* Index: sys/arm64/arm64/trap.c =================================================================== --- sys/arm64/arm64/trap.c +++ sys/arm64/arm64/trap.c @@ -192,32 +192,16 @@ } /* - * The call to pmap_fault can be dangerous when coming from the - * kernel as it may be not be able to lock the pmap to check if - * the address is now valid. 
Because of this we filter the cases - * when we are not going to see superpage activity. + * Try to handle translation, access flag, and permission faults. + * Translation faults may occur as a result of the required + * break-before-make sequence used when promoting or demoting + * superpages. Such faults must not occur while holding the pmap lock, + * or pmap_fault() will recurse on that lock. */ - if (!lower) { - /* - * We may fault in a DMAP region due to a superpage being - * unmapped when the access took place. - */ - if (map == kernel_map && !VIRT_IN_DMAP(far)) - goto no_pmap_fault; - /* - * We can also fault in the userspace handling functions, - * e.g. copyin. In these cases we will have set a fault - * handler so we can check if this is set before calling - * pmap_fault. - */ - if (map != kernel_map && pcb->pcb_onfault == 0) - goto no_pmap_fault; - } - - if (pmap_fault(map->pmap, esr, far) == KERN_SUCCESS) + if ((lower || map == kernel_map || pcb->pcb_onfault != 0) && + pmap_fault(map->pmap, esr, far) == KERN_SUCCESS) return; -no_pmap_fault: KASSERT(td->td_md.md_spinlock_count == 0, ("data abort with spinlock held")); if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | @@ -229,9 +213,11 @@ } va = trunc_page(far); - ftype = ((esr >> 6) & 1) ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ; if (exec) - ftype |= VM_PROT_EXECUTE; + ftype = VM_PROT_EXECUTE; + else + ftype = (esr & ISS_DATA_WnR) == 0 ? VM_PROT_READ : + VM_PROT_READ | VM_PROT_WRITE; /* Fault in the page. 
*/ error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); Index: sys/arm64/include/armreg.h =================================================================== --- sys/arm64/include/armreg.h +++ sys/arm64/include/armreg.h @@ -91,10 +91,10 @@ #define ISS_DATA_SF (0x01 << 15) #define ISS_DATA_AR (0x01 << 14) #define ISS_DATA_FnV (0x01 << 10) -#define ISS_DATa_EA (0x01 << 9) -#define ISS_DATa_CM (0x01 << 8) -#define ISS_INSN_S1PTW (0x01 << 7) -#define ISS_DATa_WnR (0x01 << 6) +#define ISS_DATA_EA (0x01 << 9) +#define ISS_DATA_CM (0x01 << 8) +#define ISS_DATA_S1PTW (0x01 << 7) +#define ISS_DATA_WnR (0x01 << 6) #define ISS_DATA_DFSC_MASK (0x3f << 0) #define ISS_DATA_DFSC_ASF_L0 (0x00 << 0) #define ISS_DATA_DFSC_ASF_L1 (0x01 << 0) Index: sys/arm64/include/pte.h =================================================================== --- sys/arm64/include/pte.h +++ sys/arm64/include/pte.h @@ -39,11 +39,12 @@ #endif /* Block and Page attributes */ -/* TODO: Add the upper attributes */ #define ATTR_MASK_H UINT64_C(0xfff0000000000000) #define ATTR_MASK_L UINT64_C(0x0000000000000fff) #define ATTR_MASK (ATTR_MASK_H | ATTR_MASK_L) /* Bits 58:55 are reserved for software */ +#define ATTR_SW_UNUSED (1UL << 58) +#define _ATTR_SW_DBM (1UL << 57) #define ATTR_SW_MANAGED (1UL << 56) #define ATTR_SW_WIRED (1UL << 55) #define ATTR_UXN (1UL << 54)