Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -6763,19 +6763,36 @@
 
 	/*
 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
-	 * either invalid, unused, or does not map the first 4KB physical page
-	 * within a 2MB page.
+	 * ineligible for promotion due to hardware errata, invalid, or does
+	 * not map the first 4KB physical page within a 2MB page.
 	 */
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 	newpde = *firstpte;
-	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) ||
-	    !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap,
-	    newpde))) {
+	if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde)))
+		return;
+	if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) {
 		counter_u64_add(pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
 	}
+
+	/*
+	 * Both here and in the below "for" loop, to allow for repromotion
+	 * after MADV_FREE, conditionally write protect a clean PTE before
+	 * possibly aborting the promotion due to other PTE attributes.  Why?
+	 * Suppose that MADV_FREE is applied to a part of a superpage, the
+	 * address range [S, E).  pmap_advise() will demote the superpage
+	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
+	 * clear PG_M and PG_A in the PTEs for the rest of [S, E).  Later,
+	 * imagine that the memory in [S, E) is recycled, but the last 4KB
+	 * page in [S, E) is not the last to be rewritten, or simply accessed.
+	 * In other words, there is still a 4KB page in [S, E), call it P,
+	 * that is writeable but PG_M and PG_A are clear in P's PTE.  Unless
+	 * we write protect P before aborting the promotion, if and when P is
+	 * finally rewritten, there won't be a page fault to trigger
+	 * repromotion.
+	 */
 setpde:
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 		/*
@@ -6786,16 +6803,22 @@
 			goto setpde;
 		newpde &= ~PG_RW;
 	}
+	if ((newpde & PG_A) == 0) {
+		counter_u64_add(pmap_pde_p_failures, 1);
+		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
+		    " in pmap %p", va, pmap);
+		return;
+	}
 
 	/*
 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
 	 * PTE maps an unexpected 4KB physical page or does not have identical
 	 * characteristics to the first PTE.
 	 */
-	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
+	pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE;
 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 		oldpte = *pte;
-		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
+		if ((oldpte & (PG_FRAME | PG_V)) != pa) {
 			counter_u64_add(pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
Index: sys/arm64/arm64/pmap.c
===================================================================
--- sys/arm64/arm64/pmap.c
+++ sys/arm64/arm64/pmap.c
@@ -3870,17 +3870,38 @@
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PMAP_ASSERT_STAGE1(pmap);
 
+	/*
+	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
+	 * ineligible for promotion, invalid, or does not map the first 4KB
+	 * physical page within a 2MB page.
+	 */
 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
 	newl2 = pmap_load(firstl3);
-
-	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF ||
-	    (newl2 & ATTR_SW_NO_PROMOTE) != 0) {
+	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
+		return;
+	if ((newl2 & ((~ATTR_MASK & L2_OFFSET) | ATTR_DESCR_MASK)) != L3_PAGE) {
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 
+	/*
+	 * Both here and in the below "for" loop, to allow for repromotion
+	 * after MADV_FREE, conditionally write protect a clean L3E before
+	 * possibly aborting the promotion due to other L3E attributes.  Why?
+	 * Suppose that MADV_FREE is applied to a part of a superpage, the
+	 * address range [S, E).  pmap_advise() will demote the superpage
+	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
+	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
+	 * imagine that the memory in [S, E) is recycled, but the last 4KB
+	 * page in [S, E) is not the last to be rewritten, or simply accessed.
+	 * In other words, there is still a 4KB page in [S, E), call it P,
+	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
+	 * Unless we write protect P before aborting the promotion, if and
+	 * when P is finally rewritten, there won't be a page fault to trigger
+	 * repromotion.
+	 */
 setl2:
 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
@@ -3892,10 +3913,27 @@
 			goto setl2;
 		newl2 &= ~ATTR_SW_DBM;
 	}
+	if ((newl2 & ATTR_AF) == 0) {
+		atomic_add_long(&pmap_l2_p_failures, 1);
+		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
+		    " in pmap %p", va, pmap);
+		return;
+	}
 
-	pa = newl2 + L2_SIZE - PAGE_SIZE;
+	/*
+	 * Examine each of the other L3Es in the specified PTP.  Abort if this
+	 * L3E maps an unexpected 4KB physical page or does not have identical
+	 * characteristics to the first L3E.
+	 */
+	pa = (newl2 & (~ATTR_MASK | ATTR_DESCR_MASK)) + L2_SIZE - PAGE_SIZE;
 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
 		oldl3 = pmap_load(l3);
+		if ((oldl3 & (~ATTR_MASK | ATTR_DESCR_MASK)) != pa) {
+			atomic_add_long(&pmap_l2_p_failures, 1);
+			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
+			    " in pmap %p", va, pmap);
+			return;
+		}
 setl3:
 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
@@ -3909,7 +3947,7 @@
 				goto setl3;
 			oldl3 &= ~ATTR_SW_DBM;
 		}
-		if (oldl3 != pa) {
+		if ((oldl3 & ATTR_MASK) != (newl2 & ATTR_MASK)) {
 			atomic_add_long(&pmap_l2_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
 			    " in pmap %p", va, pmap);
@@ -3948,7 +3986,7 @@
 
 	atomic_add_long(&pmap_l2_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
-	   pmap);
+	    pmap);
 }
 
 #endif /* VM_NRESERVLEVEL > 0 */
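A minimal user-space sketch of the retry pattern that both "setpde:" and
"setl2:" implement, written against assumptions: the PG_* values below are
illustrative stand-ins for the real amd64 PTE bits, and the helpers
pte_write_protect_if_clean() and try_promote_first() are hypothetical, not
functions in the tree (the kernel operates on the live page table with
atomic_fcmpset_long()).  It shows only the ordering the patch depends on:
write protect a clean, writeable entry first, and only then test the
accessed bit and possibly abort.

	#include <stdatomic.h>
	#include <stdint.h>

	/* Illustrative stand-ins for the amd64 PTE bits. */
	#define PG_RW	0x002ULL	/* writeable */
	#define PG_A	0x020ULL	/* accessed */
	#define PG_M	0x040ULL	/* modified (dirty) */

	typedef _Atomic uint64_t pt_entry_t;

	/*
	 * If the entry is clean (PG_M clear) but writeable (PG_RW set),
	 * clear PG_RW so that the next store faults.  The loop mirrors the
	 * "setpde:"/"goto setpde" retry: a failed compare-and-swap means
	 * another CPU changed the entry, so "old" is refreshed and the
	 * condition is reevaluated.  Returns the value the caller should
	 * keep using as the prospective superpage entry.
	 */
	static uint64_t
	pte_write_protect_if_clean(pt_entry_t *pte)
	{
		uint64_t old = atomic_load(pte);

		while ((old & (PG_M | PG_RW)) == PG_RW) {
			if (atomic_compare_exchange_weak(pte, &old,
			    old & ~PG_RW))
				return (old & ~PG_RW);
		}
		return (old);
	}

	/*
	 * The ordering the patch establishes: demote to read-only first,
	 * then abort on a clear accessed bit.  Even on abort, a later write
	 * to the page faults, giving the pmap a chance to repromote.
	 */
	static int
	try_promote_first(pt_entry_t *firstpte)
	{
		uint64_t newpde = pte_write_protect_if_clean(firstpte);

		return ((newpde & PG_A) != 0);
	}

Before this change, the accessed-bit test ran first, so a PTE left clean and
unreferenced by MADV_FREE aborted the promotion while still writeable, and
the eventual rewrite of that page never faulted and never retriggered
promotion.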