Changeset View
Standalone View
sys/amd64/amd64/pmap.c
Show First 20 Lines • Show All 91 Lines • ▼ Show 20 Lines | |||||
PG_RW = pmap_rw_bit(pmap); | PG_RW = pmap_rw_bit(pmap); | ||||
PG_PKU_MASK = pmap_pku_mask_bit(pmap); | PG_PKU_MASK = pmap_pku_mask_bit(pmap); | ||||
PG_PTE_CACHE = pmap_cache_mask(pmap, 0); | PG_PTE_CACHE = pmap_cache_mask(pmap, 0); | ||||
PMAP_LOCK_ASSERT(pmap, MA_OWNED); | PMAP_LOCK_ASSERT(pmap, MA_OWNED); | ||||
/* | /* | ||||
* Examine the first PTE in the specified PTP. Abort if this PTE is | * Examine the first PTE in the specified PTP. Abort if this PTE is | ||||
* either invalid, unused, or does not map the first 4KB physical page | * ineligible for promotion due to hardware errata, invalid, or does | ||||
* within a 2MB page. | * not map the first 4KB physical page within a 2MB page. | ||||
*/ | */ | ||||
firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); | firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); | ||||
newpde = *firstpte; | newpde = *firstpte; | ||||
if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) || | if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) | ||||
!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, | return; | ||||
newpde))) { | if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { | ||||
counter_u64_add(pmap_pde_p_failures, 1); | counter_u64_add(pmap_pde_p_failures, 1); | ||||
CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" | CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" | ||||
" in pmap %p", va, pmap); | " in pmap %p", va, pmap); | ||||
return; | return; | ||||
} | } | ||||
/* | |||||
* Both here and in the below "for" loop, to allow for repromotion | |||||
* after MADV_FREE, conditionally write protect a clean PTE before | |||||
* possibly aborting the promotion due to other PTE attributes. Why? | |||||
* Suppose that MADV_FREE is applied to a part of a superpage, the | |||||
markj: I don't quite understand the problem being solved. Are we simply trying to minimize the window… | |||||
Done Inline Actions Suppose that madvise(MADV_FREE) is applied to a portion of a superpage. Let's call that portion [S, E). The madvise(MADV_FREE) will demote the mapping, destroy the 4KB mapping at the end of [S, E), and remove PG_M and PG_A from the rest of the 4KB mappings. Later, imagine that malloc() recycles the memory in [S, E), but the last 4KB page in that range is not the last to be rewritten, or simply accessed. In other words, there is a 4KB page, call it P, in [S, E), that is still writeable, but PG_M and PG_A are still clear. Previously, seeing that PG_A is clear on P during an attempted promotion when the last 4KB page in [S, E) is written, we would have aborted the promotion without write protecting P. Then, if and when P is finally rewritten, there won't be a page fault to trigger repromotion. alc: Suppose that madvise(MADV_FREE) is applied to a portion of a superpage. Let's call that… | |||||
Not Done Inline Actions I must admit that this text is much more useful than the added code comment. kib: I must admit that this text is much more useful than the added code comment. | |||||
* address range [S, E). pmap_advise() will demote the superpage | |||||
* mapping, destroy the 4KB page mapping at the end of [S, E), and | |||||
* clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, | |||||
* imagine that the memory in [S, E) is recycled, but the last 4KB | |||||
* page in [S, E) is not the last to be rewritten, or simply accessed. | |||||
* In other words, there is still a 4KB page in [S, E), call it P, | |||||
* that is writeable but PG_M and PG_A are clear in P's PTE. Unless | |||||
* we write protect P before aborting the promotion, if and when P is | |||||
* finally rewritten, there won't be a page fault to trigger | |||||
* repromotion. | |||||
*/ | |||||
setpde: | setpde: | ||||
if ((newpde & (PG_M | PG_RW)) == PG_RW) { | if ((newpde & (PG_M | PG_RW)) == PG_RW) { | ||||
/* | /* | ||||
* When PG_M is already clear, PG_RW can be cleared without | * When PG_M is already clear, PG_RW can be cleared without | ||||
* a TLB invalidation. | * a TLB invalidation. | ||||
*/ | */ | ||||
if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) | if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) | ||||
goto setpde; | goto setpde; | ||||
newpde &= ~PG_RW; | newpde &= ~PG_RW; | ||||
} | } | ||||
if ((newpde & PG_A) == 0) { | |||||
counter_u64_add(pmap_pde_p_failures, 1); | |||||
CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" | |||||
" in pmap %p", va, pmap); | |||||
return; | |||||
} | |||||
/* | /* | ||||
* Examine each of the other PTEs in the specified PTP. Abort if this | * Examine each of the other PTEs in the specified PTP. Abort if this | ||||
* PTE maps an unexpected 4KB physical page or does not have identical | * PTE maps an unexpected 4KB physical page or does not have identical | ||||
* characteristics to the first PTE. | * characteristics to the first PTE. | ||||
*/ | */ | ||||
pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; | pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; | ||||
for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { | for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { | ||||
oldpte = *pte; | oldpte = *pte; | ||||
if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { | if ((oldpte & (PG_FRAME | PG_V)) != pa) { | ||||
counter_u64_add(pmap_pde_p_failures, 1); | counter_u64_add(pmap_pde_p_failures, 1); | ||||
CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" | CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" | ||||
" in pmap %p", va, pmap); | " in pmap %p", va, pmap); | ||||
return; | return; | ||||
} | } | ||||
setpte: | setpte: | ||||
if ((oldpte & (PG_M | PG_RW)) == PG_RW) { | if ((oldpte & (PG_M | PG_RW)) == PG_RW) { | ||||
/* | /* | ||||
▲ Show 20 Lines • Show All 91 Lines • Show Last 20 Lines |
I don't quite understand the problem being solved. Are we simply trying to minimize the window in which a concurrent access sets PG_A after pmap_promote_pde() has loaded a copy of the PTE? The use of the word "must" in the comment above makes me think I'm missing something.