diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -1495,6 +1495,15 @@
 	pmap->pm_stats.resident_count += count;
 }
 
+static __inline void
+pmap_pt_page_count_pinit(pmap_t pmap, int count)
+{
+	KASSERT(pmap->pm_stats.resident_count + count >= 0,
+	    ("pmap %p resident count underflow %ld %d", pmap,
+	    pmap->pm_stats.resident_count, count));
+	pmap->pm_stats.resident_count += count;
+}
+
 static __inline void
 pmap_pt_page_count_adj(pmap_t pmap, int count)
 {
@@ -4344,13 +4353,24 @@
 	vm_paddr_t pmltop_phys;
 	int i;
 
+	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+
 	/*
-	 * Allocate the page directory page.  Pass NULL instead of a pointer to
-	 * the pmap here to avoid recording this page in the resident count, as
-	 * optimizations in pmap_remove() depend on this.
+	 * Allocate the page directory page.  Pass NULL instead of a
+	 * pointer to the pmap here to avoid calling
+	 * pmap_resident_count_adj() through pmap_pt_page_count_adj(),
+	 * since that requires the pmap lock.  Instead, do the
+	 * accounting manually.
+	 *
+	 * Note that the optimization in the final pmap_remove() call,
+	 * which checks for zero resident_count, is effectively
+	 * disabled by accounting for the top-level page.  That
+	 * optimization was already ineffective anyway, since we
+	 * started using a non-managed mapping of the shared page.
 	 */
 	pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO |
 	    VM_ALLOC_WAITOK);
+	pmap_pt_page_count_pinit(pmap, 1);
 
 	pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
 	pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
@@ -4380,11 +4400,13 @@
 		pmap_pinit_pml4(pmltop_pg);
 		if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
 			/*
-			 * As with pmltop_pg, pass NULL instead of a pointer to
-			 * the pmap to ensure that the PTI page isn't counted.
+			 * As with pmltop_pg, pass NULL instead of a
+			 * pointer to the pmap to ensure that the PTI
+			 * page is counted explicitly.
 			 */
 			pmltop_pgu = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED |
 			    VM_ALLOC_WAITOK);
+			pmap_pt_page_count_pinit(pmap, 1);
 			pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
 			    VM_PAGE_TO_PHYS(pmltop_pgu));
 			if (pmap_is_la57(pmap))
@@ -4407,7 +4429,6 @@
 	vm_radix_init(&pmap->pm_root);
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
-	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = flags;
 	pmap->pm_eptgen = 0;
 
@@ -4799,9 +4820,6 @@
 	vm_page_t m;
 	int i;
 
-	KASSERT(pmap->pm_stats.resident_count == 0,
-	    ("pmap_release: pmap %p resident count %ld != 0",
-	    pmap, pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap %p has reserved page table page(s)",
 	    pmap));
@@ -4834,15 +4852,21 @@
 	}
 
 	pmap_free_pt_page(NULL, m, true);
+	pmap_pt_page_count_pinit(pmap, -1);
 
 	if (pmap->pm_pmltopu != NULL) {
 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
 		    pm_pmltopu));
 		pmap_free_pt_page(NULL, m, false);
+		pmap_pt_page_count_pinit(pmap, -1);
 	}
 	if (pmap->pm_type == PT_X86 &&
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
 		rangeset_fini(&pmap->pm_pkru);
+
+	KASSERT(pmap->pm_stats.resident_count == 0,
+	    ("pmap_release: pmap %p resident count %ld != 0",
+	    pmap, pmap->pm_stats.resident_count));
 }
 
 static int
@@ -6254,9 +6278,14 @@
 	PG_V = pmap_valid_bit(pmap);
 
 	/*
+	 * If there are no resident pages besides the top level page
+	 * table page(s), there is nothing to do.  The kernel pmap
+	 * always accounts the whole preloaded area as resident, which
+	 * makes its resident count > 2.
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
-	if (pmap->pm_stats.resident_count == 0)
+	if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ?
+	    1 : 0))
 		return;
 
 	anyvalid = 0;
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -1070,6 +1070,30 @@
 	umtx_exec(td->td_proc);
 }
 
+/*
+ * This is an optimization that removes the unmanaged shared page
+ * mapping.  In combination with pmap_remove_pages(), which cleans all
+ * managed mappings in the process' vmspace pmap, no work is left for
+ * pmap_remove(min, max).
+ */
+void
+exec_free_abi_mappings(struct proc *p)
+{
+	struct vmspace *vmspace;
+	struct sysentvec *sv;
+
+	vmspace = p->p_vmspace;
+	if (refcount_load(&vmspace->vm_refcnt) != 1)
+		return;
+
+	sv = p->p_sysent;
+	if (sv->sv_shared_page_obj == NULL)
+		return;
+
+	pmap_remove(vmspace_pmap(vmspace), sv->sv_shared_page_base,
+	    sv->sv_shared_page_base + sv->sv_shared_page_len);
+}
+
 /*
  * Destroy old address space, and allocate a new stack.
  *	The new stack is only sgrowsiz large because it is grown
@@ -1112,6 +1136,7 @@
 	    vm_map_min(map) == sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser &&
 	    cpu_exec_vmspace_reuse(p, map)) {
+		exec_free_abi_mappings(p);
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -417,6 +417,7 @@
 		mtx_unlock(&ppeers_lock);
 	}
 
+	exec_free_abi_mappings(p);
 	vmspace_exit(td);
 	(void)acct_process(td);
 
diff --git a/sys/sys/sysent.h b/sys/sys/sysent.h
--- a/sys/sys/sysent.h
+++ b/sys/sys/sysent.h
@@ -324,6 +324,7 @@
 
 void	exec_inittk(void);
 void	exit_onexit(struct proc *p);
+void	exec_free_abi_mappings(struct proc *p);
 void	exec_onexec_old(struct thread *td);
 
 #define	INIT_SYSENTVEC(name, sv)				\