Index: stable/11/sys/amd64/amd64/pmap.c =================================================================== --- stable/11/sys/amd64/amd64/pmap.c (revision 328385) +++ stable/11/sys/amd64/amd64/pmap.c (revision 328386) @@ -1,7541 +1,7541 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = X86_PG_G; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } extern struct pcpu __pcpu[]; #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; static u_long pmap_invl_gen = 0; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; static bool pmap_not_in_di(void) { return (curthread->td_md.md_invl_gen.gen == 0); } #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(pmap_not_in_di(), ("DI already started")) /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_started(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. */ static void pmap_delayed_invl_finished(void) { struct pmap_invl_gen *invl_gen, *next; struct turnstile *ts; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_started")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) { turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); pmap_invl_gen = invl_gen->gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts, TS_SHARED_LOCK); } turnstile_chain_unlock(&invl_gen_ts); } else { next->gen = invl_gen->gen; } LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); #endif static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait(vm_page_t m) { struct turnstile *ts; u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > pmap_invl_gen) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finished(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); /* * Move the kernel virtual free pointer to the next * 2MB. This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = roundup2(addr, NBPDR); return (newaddr); } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe; pt_entry_t *pt_p; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) ndm1g = ptoa(Maxmem) >> PDPSHIFT; if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* Fill in the underlying page table pages */ /* Nominally read-only (but really R/W) from zero to physfree */ /* XXX not fully used, underneath 2M pages */ pt_p = (pt_entry_t *)KPTphys; for (i = 0; ptoa(i) < *firstaddr; i++) pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; /* Now map the page tables at their location within PTmap */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G; /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | X86_PG_M | X86_PG_A; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | X86_PG_M | X86_PG_A; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U; } } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte; int i; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) load_cr4(rcr4() | CR4_SMEP); /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; virtual_avail = va; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { /* Check for INVPCID support */ invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } __pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1; __pcpu[0].pc_pcid_gen = 1; /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } else { pmap_pcid_enabled = 0; } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int error, i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } bool pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask; u_int cpuid, i; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) invlpg(va); else if (pmap_pcid_enabled) pmap->pm_pcids[cpuid].pm_gen = 0; if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask; vm_offset_t addr; u_int cpuid, i; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); cpuid = PCPU_GET(cpuid); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else if (pmap_pcid_enabled) { pmap->pm_pcids[cpuid].pm_gen = 0; } if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask; struct invpcid_descr d; u_int cpuid, i; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { if (invpcid_works) { d.pcid = pmap->pm_pcids[cpuid].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { load_cr3(pmap->pm_cr3 | pmap->pm_pcids [PCPU_GET(cpuid)].pm_pcid); } } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[cpuid].pm_gen = 0; } if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, - smp_no_rendevous_barrier, pmap_update_pde_action, + smp_no_rendezvous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) invlpg(va); else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid); } } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + NBPDR). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + NBPDR), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { if (pmap_kextract(sva) == lapic_paddr) return; /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; int count; for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } atomic_subtract_int(&vm_cnt.v_wire_count, count); } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); pmap->pm_cr3 = KPML4phys; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } PCPU_SET(curpmap, kernel_pmap); pmap_activate(curthread); CPU_FILL(&kernel_pmap->pm_active); } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. */ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg; vm_paddr_t pml4phys; int i; /* * allocate the page directory page */ pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pml4phys = VM_PAGE_TO_PHYS(pml4pg); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = ~0; /* initialize to an invalid value */ if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. */ if ((pmap->pm_type = pm_type) == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); VM_WAIT; PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (start_di) pmap_delayed_invl_finished(); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; bool start_di; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; /* * A delayed invalidation block should already be active if * pmap_advise() or pmap_remove() called this function by way * of pmap_demote_pde_locked(). */ start_di = pmap_not_in_di(); mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); if (start_di) pmap_delayed_invl_started(); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { if (start_di) pmap_delayed_invl_started(); mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } else if (start_di) pmap_delayed_invl_started(); PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; int avail, free; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; int PG_PTE_CACHE; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed or the allocation of the new * page table page fails. If the 2MB page mapping belongs to * the direct map region of the kernel's address space, then * the page allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is * normal. Page table pages are preallocated for every other * part of the kernel address space, so the direct map region * is the only part of the kernel address space that must be * handled here. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap_resident_count_inc(pmap, 1); } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (va >= VM_MAXUSER_ADDRESS) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * Initialize the page table page. */ pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_G = pmap_global_bit(pmap); anyvalid = false; va = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } if ((*pte & PG_G) == 0) anyvalid = true; else if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { sva += PAGE_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_started(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) anyvalid = 1; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finished(); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; /* * Although this function delays and batches the invalidation * of stale TLB entries, it does not need to call * pmap_delayed_invl_started() and * pmap_delayed_invl_finished(), because it does not * ordinarily destroy mappings. Stale TLB entries from * protection-only changes need only be invalidated before the * pmap lock is released, because protection-only changes do * not destroy PV entries. Even operations that iterate over * a physical page's PV list of mappings, like * pmap_remove_write(), acquire the pmap lock for each * mapping. Consequently, for protection-only changes, the * pmap lock suffices to synchronize both page table and TLB * updates. * * This function only destroys a mapping if pmap_demote_pde() * fails. In that case, stale TLB entries are immediately * invalidated. */ PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); opa = origpte & PG_FRAME; if (opa != pa) { if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pmap_pvh_free(&om->md, pmap, va); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; oldpde = *pde; if ((oldpde & PG_V) != 0) { KASSERT(pdpg->wire_count > 1, ("pmap_enter_pde: pdpg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { pdpg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * The reference to the PD page that was acquired by * pmap_allocpde() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_started(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, lockp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finished(); } pmap_free_zero_pages(&free); if (va >= VM_MAXUSER_ADDRESS) { mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt)) { /* * XXX Currently, this can't happen because * we do not perform pmap_enter(psind == 1) * on the kernel pmap. */ panic("pmap_enter_pde: trie insert failed"); } } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pt_entry_t *pte, PG_V; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(addr)]; if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_started()/finished() calls around the * function are not needed. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; pt_entry_t PG_A, PG_M, PG_V; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. */ if (pmap_emulate_ad_bits(dst_pmap)) return; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t srcptepaddr, *pde; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); if (dst_pdpg == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else dst_pdpg->wire_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { if (dstmpte != NULL && dstmpte->pindex == pmap_pde_pindex(addr)) dstmpte->wire_count++; else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not * mapped, paging-structure * caches could nonetheless * have entries that refer to * the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; boolean_t mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pages[0] = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) pmap_unmap_io_transient(pages, vaddr, 2, FALSE); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_started() and * pmap_delayed_invl_finished(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); pmap_free_zero_pages(&free); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; boolean_t anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. */ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; pmap_delayed_invl_started(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pde_to_pte(pde, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, *pde, NULL, &lock); anychanged = TRUE; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finished(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { while (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size < dmaplimit) { va = PHYS_TO_DMAP(pa); if (!pmap_change_attr(va, size, mode)) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + tmpsize, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pdpgpa; vm_page_t pdpg; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) return (EINVAL); if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte, X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V)) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN || pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x", cpuid, pcid_next)); if (pcid_next == PMAP_PCID_OVERMAX) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; uint64_t cached, cr3; register_t rflags; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) return; cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif cr3 = rcr3(); if (pmap_pcid_enabled) { cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x", td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); /* * If the INVPCID instruction is not available, * invltlb_pcid_handler() is used for handle * invalidate_all IPI, which checks for curpmap == * smp_tlb_pmap. Below operations sequence has a * window where %CR3 is loaded with the new pmap's * PML4 address, but curpmap value is not yet updated. * This causes invltlb IPI handler, called between the * updates, to execute as NOP, which leaves stale TLB * entries. * * Note that the most typical use of * pmap_activate_sw(), from the context switch, is * immune to this race, because interrupts are * disabled (while the thread lock is owned), and IPI * happends after curpmap is updated. Protect other * callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap * assignment. */ if (!invpcid_works) rflags = intr_disable(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); if (cached) PCPU_INC(pm_save_cnt); } PCPU_SET(curpmap, pmap); if (!invpcid_works) intr_restore(rflags); } else if (cr3 != pmap->pm_cr3) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. */ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } #if VM_NRESERVLEVEL > 0 /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #endif #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. */ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (have_addr) { va = (vm_offset_t)addr; pmap = PCPU_GET(curpmap); /* XXX */ } else { db_printf("show pte addr\n"); return; } PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); db_printf("VA %#016lx pml4e %#016lx", va, *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); db_printf(" pdpe %#016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); db_printf(" pde %#016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); db_printf(" pte %#016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } #endif Index: stable/11/sys/amd64/include/counter.h =================================================================== --- stable/11/sys/amd64/include/counter.h (revision 328385) +++ stable/11/sys/amd64/include/counter.h (revision 328386) @@ -1,89 +1,89 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include extern struct pcpu __pcpu[1]; #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, i) counter_u64_add(c, i) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { __asm __volatile("addq\t%1,%%gs:(%0)" : : "r" ((char *)c - (char *)&__pcpu[0]), "ri" (inc) : "memory", "cc"); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: stable/11/sys/arm/arm/pmap-v6.c =================================================================== --- stable/11/sys/arm/arm/pmap-v6.c (revision 328385) +++ stable/11/sys/arm/arm/pmap-v6.c (revision 328386) @@ -1,6887 +1,6887 @@ /*- * Copyright (c) 1991 Regents of the University of California. * Copyright (c) 1994 John S. Dyson * Copyright (c) 1994 David Greenman * Copyright (c) 2005-2010 Alan L. Cox * Copyright (c) 2014-2016 Svatopluk Kraus * Copyright (c) 2014-2016 Michal Meloun * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include "opt_pmap.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #ifndef DIAGNOSTIC #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PMAP_DEBUG static void pmap_zero_page_check(vm_page_t m); void pmap_debug(int level); int pmap_pid_dump(int pid); #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 1; #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #endif /* PMAP_DEBUG */ /* * Level 2 page tables map definion ('max' is excluded). */ #define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) #define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define UPT2V_MAX_ADDRESS \ ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) /* * Promotion to a 1MB (PTE1) page mapping requires that the corresponding * 4KB (PTE2) page mappings have identical settings for the following fields: */ #define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ PTE2_ATTR_MASK) #define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ PTE1_ATTR_MASK) #define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ (((l2_attr) & L2_C) ? L1_S_C : 0) | \ (((l2_attr) & L2_B) ? L1_S_B : 0) | \ (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ (((l2_attr) & PTE2_W) ? PTE1_W : 0)) #define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ (((l1_attr) & L1_S_C) ? L2_C : 0) | \ (((l1_attr) & L1_S_B) ? L2_B : 0) | \ (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ (((l1_attr) & PTE1_W) ? PTE2_W : 0)) /* * PTE2 descriptors creation macros. */ #define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) #define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) #define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) #define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * The boot_pt1 is used temporary in very early boot stage as L1 page table. * We can init many things with no memory allocation thanks to its static * allocation and this brings two main advantages: * (1) other cores can be started very simply, * (2) various boot loaders can be supported as its arguments can be processed * in virtual address space and can be moved to safe location before * first allocation happened. * Only disadvantage is that boot_pt1 is used only in very early boot stage. * However, the table is uninitialized and so lays in bss. Therefore kernel * image size is not influenced. * * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and * CPU suspend/resume game. */ extern pt1_entry_t boot_pt1[]; vm_paddr_t base_pt1; pt1_entry_t *kern_pt1; pt2_entry_t *kern_pt2tab; pt2_entry_t *PT2MAP; static uint32_t ttb_flags; static vm_memattr_t pt_memattr; ttb_entry_t pmap_kern_ttb; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static vm_offset_t kernel_vm_end_new; vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; vm_offset_t vm_max_kernel_address; vm_paddr_t kernel_l1pa; static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ vm_paddr_t first_managed_pa; #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) /* * All those kernel PT submaps that BSD is so fond of */ static pt2_entry_t *CMAP3; static caddr_t CADDR3; caddr_t _tmppt = 0; struct msgbuf *msgbufp = NULL; /* XXX move it to machdep.c */ /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt2_entry_t *PMAP1 = NULL, *PMAP2; static pt2_entry_t *PADDR1 = NULL, *PADDR2; #ifdef DDB static pt2_entry_t *PMAP3; static pt2_entry_t *PADDR3; static int PMAP3cpu __unused; /* for SMP only */ #endif #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte2_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte2_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte2_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static __inline void pt2_wirecount_init(vm_page_t m); static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va); void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); /* * Function to set the debug level of the pmap code. */ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ /* * This table must corespond with memory attribute configuration in vm.h. * First entry is used for normal system mapping. * * Device memory is always marked as shared. * Normal memory is shared only in SMP . * Not outer shareable bits are not used yet. * Class 6 cannot be used on ARM11. */ #define TEXDEF_TYPE_SHIFT 0 #define TEXDEF_TYPE_MASK 0x3 #define TEXDEF_INNER_SHIFT 2 #define TEXDEF_INNER_MASK 0x3 #define TEXDEF_OUTER_SHIFT 4 #define TEXDEF_OUTER_MASK 0x3 #define TEXDEF_NOS_SHIFT 6 #define TEXDEF_NOS_MASK 0x1 #define TEX(t, i, o, s) \ ((t) << TEXDEF_TYPE_SHIFT) | \ ((i) << TEXDEF_INNER_SHIFT) | \ ((o) << TEXDEF_OUTER_SHIFT | \ ((s) << TEXDEF_NOS_SHIFT)) static uint32_t tex_class[8] = { /* type inner cache outer cache */ TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ }; #undef TEX static uint32_t pte2_attr_tab[8] = { PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 0, /* 5 - NOT USED YET */ 0, /* 6 - NOT USED YET */ 0 /* 7 - NOT USED YET */ }; CTASSERT(VM_MEMATTR_WB_WA == 0); CTASSERT(VM_MEMATTR_NOCACHE == 1); CTASSERT(VM_MEMATTR_DEVICE == 2); CTASSERT(VM_MEMATTR_SO == 3); CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); static inline uint32_t vm_memattr_to_pte2(vm_memattr_t ma) { KASSERT((u_int)ma < 5, ("%s: bad vm_memattr_t %d", __func__, ma)); return (pte2_attr_tab[(u_int)ma]); } static inline uint32_t vm_page_pte2_attr(vm_page_t m) { return (vm_memattr_to_pte2(m->md.pat_mode)); } /* * Convert TEX definition entry to TTB flags. */ static uint32_t encode_ttb_flags(int idx) { uint32_t inner, outer, nos, reg; inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; reg = nos << 5; reg |= outer << 3; if (cpuinfo.coherent_walk) reg |= (inner & 0x1) << 6; reg |= (inner & 0x2) >> 1; #ifdef SMP reg |= 1 << 1; #endif return reg; } /* * Set TEX remapping registers in current CPU. */ void pmap_set_tex(void) { uint32_t prrr, nmrr; uint32_t type, inner, outer, nos; int i; #ifdef PMAP_PTE_NOCACHE /* XXX fixme */ if (cpuinfo.coherent_walk) { pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); } else { pt_memattr = VM_MEMATTR_NOCACHE; ttb_flags = encode_ttb_flags(1); } #else pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); #endif prrr = 0; nmrr = 0; /* Build remapping register from TEX classes. */ for (i = 0; i < 8; i++) { type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & TEXDEF_TYPE_MASK; inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; prrr |= type << (i * 2); prrr |= nos << (i + 24); nmrr |= inner << (i * 2); nmrr |= outer << (i * 2 + 16); } /* Add shareable bits for device memory. */ prrr |= PRRR_DS0 | PRRR_DS1; /* Add shareable bits for normal memory in SMP case. */ #ifdef SMP prrr |= PRRR_NS1; #endif cp15_prrr_set(prrr); cp15_nmrr_set(nmrr); /* Caches are disabled, so full TLB flush should be enough. */ tlb_flush_all_local(); } /* * Remap one vm_meattr class to another one. This can be useful as * workaround for SOC errata, e.g. if devices must be accessed using * SO memory class. * * !!! Please note that this function is absolutely last resort thing. * It should not be used under normal circumstances. !!! * * Usage rules: * - it shall be called after pmap_bootstrap_prepare() and before * cpu_mp_start() (thus only on boot CPU). In practice, it's expected * to be called from platform_attach() or platform_late_init(). * * - if remapping doesn't change caching mode, or until uncached class * is remapped to any kind of cached one, then no other restriction exists. * * - if pmap_remap_vm_attr() changes caching mode, but both (original and * remapped) remain cached, then caller is resposible for calling * of dcache_wbinv_poc_all(). * * - remapping of any kind of cached class to uncached is not permitted. */ void pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) { int old_idx, new_idx; /* Map VM memattrs to indexes to tex_class table. */ old_idx = pte2_attr_tab[(int)old_attr]; new_idx = pte2_attr_tab[(int)new_attr]; /* Replace TEX attribute and apply it. */ tex_class[old_idx] = tex_class[new_idx]; pmap_set_tex(); } /* * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, * KERNBASE is mapped by first L2 page table in L2 page table page. It * meets same constrain due to PT2MAP being placed just under KERNBASE. */ CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); /* * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. * For now, anyhow, the following check must be fulfilled. */ CTASSERT(PAGE_SIZE == PTE2_SIZE); /* * We don't want to mess up MI code with all MMU and PMAP definitions, * so some things, which depend on other ones, are defined independently. * Now, it is time to check that we don't screw up something. */ CTASSERT(PDRSHIFT == PTE1_SHIFT); /* * Check L1 and L2 page table entries definitions consistency. */ CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); /* * Check L2 page tables page consistency. */ CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); /* * Check PT2TAB consistency. * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. * This should be done without remainder. */ CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG)); /* * A PT2MAP magic. * * All level 2 page tables (PT2s) are mapped continuously and accordingly * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page * must be used together, but not necessary at once. The first PT2 in a page * must map things on correctly aligned address and the others must follow * in right order. */ #define NB_IN_PT2TAB (PT2TAB_ENTRIES * sizeof(pt2_entry_t)) #define NPT2_IN_PT2TAB (NB_IN_PT2TAB / NB_IN_PT2) #define NPG_IN_PT2TAB (NB_IN_PT2TAB / PAGE_SIZE) /* * Check PT2TAB consistency. * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2. * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. * The both should be done without remainder. */ CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); /* * The implementation was made general, however, with the assumption * bellow in mind. In case of another value of NPG_IN_PT2TAB, * the code should be once more rechecked. */ CTASSERT(NPG_IN_PT2TAB == 1); /* * Get offset of PT2 in a page * associated with given PT1 index. */ static __inline u_int page_pt2off(u_int pt1_idx) { return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); } /* * Get physical address of PT2 * associated with given PT2s page and PT1 index. */ static __inline vm_paddr_t page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) { return (pgpa + page_pt2off(pt1_idx)); } /* * Get first entry of PT2 * associated with given PT2s page and PT1 index. */ static __inline pt2_entry_t * page_pt2(vm_offset_t pgva, u_int pt1_idx) { return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); } /* * Get virtual address of PT2s page (mapped in PT2MAP) * which holds PT2 which holds entry which maps given virtual address. */ static __inline vm_offset_t pt2map_pt2pg(vm_offset_t va) { va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); return ((vm_offset_t)pt2map_entry(va)); } /***************************************************************************** * * THREE pmap initialization milestones exist: * * locore.S * -> fundamental init (including MMU) in ASM * * initarm() * -> fundamental init continues in C * -> first available physical address is known * * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) * -> basic (safe) interface for physical address allocation is made * -> basic (safe) interface for virtual mapping is made * -> limited not SMP coherent work is possible * * -> more fundamental init continues in C * -> locks and some more things are available * -> all fundamental allocations and mappings are done * * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) * -> phys_avail[] and virtual_avail is set * -> control is passed to vm subsystem * -> physical and virtual address allocation are off limit * -> low level mapping functions, some SMP coherent, * are available, which cannot be used before vm subsystem * is being inited * * mi_startup() * -> vm subsystem is being inited * * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) * -> pmap is fully inited * *****************************************************************************/ /***************************************************************************** * * PMAP first stage initialization and utility functions * for pre-bootstrap epoch. * * After pmap_bootstrap_prepare() is called, the following functions * can be used: * * (1) strictly only for this stage functions for physical page allocations, * virtual space allocations, and mappings: * * vm_paddr_t pmap_preboot_get_pages(u_int num); * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); * vm_offset_t pmap_preboot_reserve_pages(u_int num); * vm_offset_t pmap_preboot_get_vpages(u_int num); * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, * vm_prot_t prot, vm_memattr_t attr); * * (2) for all stages: * * vm_paddr_t pmap_kextract(vm_offset_t va); * * NOTE: This is not SMP coherent stage. * *****************************************************************************/ #define KERNEL_P2V(pa) \ ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) #define KERNEL_V2P(va) \ ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) static vm_paddr_t last_paddr; /* * Pre-bootstrap epoch page allocator. */ vm_paddr_t pmap_preboot_get_pages(u_int num) { vm_paddr_t ret; ret = last_paddr; last_paddr += num * PAGE_SIZE; return (ret); } /* * The fundamental initialization of PMAP stuff. * * Some things already happened in locore.S and some things could happen * before pmap_bootstrap_prepare() is called, so let's recall what is done: * 1. Caches are disabled. * 2. We are running on virtual addresses already with 'boot_pt1' * as L1 page table. * 3. So far, all virtual addresses can be converted to physical ones and * vice versa by the following macros: * KERNEL_P2V(pa) .... physical to virtual ones, * KERNEL_V2P(va) .... virtual to physical ones. * * What is done herein: * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. * 2. PT2MAP magic is brought to live. * 3. Basic preboot functions for page allocations and mappings can be used. * 4. Everything is prepared for L1 cache enabling. * * Variations: * 1. To use second TTB register, so kernel and users page tables will be * separated. This way process forking - pmap_pinit() - could be faster, * it saves physical pages and KVA per a process, and it's simple change. * However, it will lead, due to hardware matter, to the following: * (a) 2G space for kernel and 2G space for users. * (b) 1G space for kernel in low addresses and 3G for users above it. * A question is: Is the case (b) really an option? Note that case (b) * does save neither physical memory and KVA. */ void pmap_bootstrap_prepare(vm_paddr_t last) { vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; vm_offset_t pt2pg_va; pt1_entry_t *pte1p; pt2_entry_t *pte2p; u_int i; uint32_t actlr_mask, actlr_set, l1_attr; /* * Now, we are going to make real kernel mapping. Note that we are * already running on some mapping made in locore.S and we expect * that it's large enough to ensure nofault access to physical memory * allocated herein before switch. * * As kernel image and everything needed before are and will be mapped * by section mappings, we align last physical address to PTE1_SIZE. */ last_paddr = pte1_roundup(last); /* * Allocate and zero page(s) for kernel L1 page table. * * Note that it's first allocation on space which was PTE1_SIZE * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. */ base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); bzero((void*)kern_pt1, NB_IN_PT1); pte1_sync_range(kern_pt1, NB_IN_PT1); /* Allocate and zero page(s) for kernel PT2TAB. */ pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); bzero(kern_pt2tab, NB_IN_PT2TAB); pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); /* Allocate and zero page(s) for kernel L2 page tables. */ pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); pt2pg_va = KERNEL_P2V(pt2pg_pa); size = NKPT2PG * PAGE_SIZE; bzero((void*)pt2pg_va, size); pte2_sync_range((pt2_entry_t *)pt2pg_va, size); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated pages for kernel L2 page tables so that vm_page * structures representing these pages will be created. The vm_page * structures are required for promotion of the corresponding kernel * virtual addresses to section mappings. */ vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); /* * Insert allocated L2 page table pages to PT2TAB and make * link to all PT2s in L1 page table. See how kernel_vm_end * is initialized. * * We play simple and safe. So every KVA will have underlaying * L2 page table, even kernel image mapped by sections. */ pte2p = kern_pt2tab_entry(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) pt2tab_store(pte2p++, PTE2_KPT(pa)); pte1p = kern_pte1(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) pte1_store(pte1p++, PTE1_LINK(pa)); /* Make section mappings for kernel. */ l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); pte1p = kern_pte1(KERNBASE); for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); /* * Get free and aligned space for PT2MAP and make L1 page table links * to L2 page tables held in PT2TAB. * * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus * each entry in PT2TAB maps all PT2s in a page. This implies that * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. */ PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); pte1p = kern_pte1((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. * Each pmap will hold own PT2TAB, so the mapping should be not global. */ pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* * Choose correct L2 page table and make mappings for allocations * made herein which replaces temporary locore.S mappings after a while. * Note that PT2MAP cannot be used until we switch to kern_pt1. * * Note, that these allocations started aligned on 1M section and * kernel PT1 was allocated first. Making of mappings must follow * order of physical allocations as we've used KERNEL_P2V() macro * for virtual addresses resolution. */ pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); /* Make mapping for kernel L1 page table. */ for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Make mapping for kernel PT2TAB. */ for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ pmap_kern_ttb = base_pt1 | ttb_flags; cpuinfo_get_actlr_modifier(&actlr_mask, &actlr_set); reinit_mmu(pmap_kern_ttb, actlr_mask, actlr_set); /* * Initialize the first available KVA. As kernel image is mapped by * sections, we are leaving some gap behind. */ virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; } /* * Setup L2 page table page for given KVA. * Used in pre-bootstrap epoch. * * Note that we have allocated NKPT2PG pages for L2 page tables in advance * and used them for mapping KVA starting from KERNBASE. However, this is not * enough. Vectors and devices need L2 page tables too. Note that they are * even above VM_MAX_KERNEL_ADDRESS. */ static __inline vm_paddr_t pmap_preboot_pt2pg_setup(vm_offset_t va) { pt2_entry_t *pte2p, pte2; vm_paddr_t pt2pg_pa; /* Get associated entry in PT2TAB. */ pte2p = kern_pt2tab_entry(va); /* Just return, if PT2s page exists already. */ pte2 = pt2tab_load(pte2p); if (pte2_is_valid(pte2)) return (pte2_pa(pte2)); KASSERT(va >= VM_MAX_KERNEL_ADDRESS, ("%s: NKPT2PG too small", __func__)); /* * Allocate page for PT2s and insert it to PT2TAB. * In other words, map it into PT2MAP space. */ pt2pg_pa = pmap_preboot_get_pages(1); pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); /* Zero all PT2s in allocated page. */ bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); return (pt2pg_pa); } /* * Setup L2 page table for given KVA. * Used in pre-bootstrap epoch. */ static void pmap_preboot_pt2_setup(vm_offset_t va) { pt1_entry_t *pte1p; vm_paddr_t pt2pg_pa, pt2_pa; /* Setup PT2's page. */ pt2pg_pa = pmap_preboot_pt2pg_setup(va); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); /* Insert PT2 to PT1. */ pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } /* * Get L2 page entry associated with given KVA. * Used in pre-bootstrap epoch. */ static __inline pt2_entry_t* pmap_preboot_vtopte2(vm_offset_t va) { pt1_entry_t *pte1p; /* Setup PT2 if needed. */ pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ pmap_preboot_pt2_setup(va); return (pt2map_entry(va)); } /* * Pre-bootstrap epoch page(s) mapping(s). */ void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) { u_int i; pt2_entry_t *pte2p; /* Map all the pages. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KRW(pa)); va += PAGE_SIZE; pa += PAGE_SIZE; } } /* * Pre-bootstrap epoch virtual space alocator. */ vm_offset_t pmap_preboot_reserve_pages(u_int num) { u_int i; vm_offset_t start, va; pt2_entry_t *pte2p; /* Allocate virtual space. */ start = va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Zero the mapping. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, 0); va += PAGE_SIZE; } return (start); } /* * Pre-bootstrap epoch page(s) allocation and mapping(s). */ vm_offset_t pmap_preboot_get_vpages(u_int num) { vm_paddr_t pa; vm_offset_t va; /* Allocate physical page(s). */ pa = pmap_preboot_get_pages(num); /* Allocate virtual space. */ va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Map and zero all. */ pmap_preboot_map_pages(pa, va, num); bzero((void *)va, num * PAGE_SIZE); return (va); } /* * Pre-bootstrap epoch page mapping(s) with attributes. */ void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, vm_prot_t prot, vm_memattr_t attr) { u_int num; u_int l1_attr, l1_prot, l2_prot, l2_attr; pt1_entry_t *pte1p; pt2_entry_t *pte2p; l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l2_attr = vm_memattr_to_pte2(attr); l1_prot = ATTR_TO_L1(l2_prot); l1_attr = ATTR_TO_L1(l2_attr); /* Map all the pages. */ num = round_page(size); while (num > 0) { if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); va += PTE1_SIZE; pa += PTE1_SIZE; num -= PTE1_SIZE; } else { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); va += PAGE_SIZE; pa += PAGE_SIZE; num -= PAGE_SIZE; } } } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". */ vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); } else if (pte1_is_link(pte1)) { /* * We should beware of concurrent promotion that changes * pte1 at this point. However, it's not a problem as PT2 * page is preserved by promotion in PT2TAB. So even if * it happens, using of PT2MAP is still safe. * * QQQ: However, concurrent removing is a problem which * ends in abort on PT2MAP space. Locking must be used * to deal with this. */ pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2) | (va & PTE2_OFFSET); } else { panic("%s: va %#x pte1 %#x", __func__, va, pte1); } return (pa); } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". Also * return L2 page table entry which maps the address. * * This is only intended to be used for panic dumps. */ vm_paddr_t pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2); } else { pte2 = 0; pa = 0; } if (pte2p != NULL) *pte2p = pte2; return (pa); } /***************************************************************************** * * PMAP second stage initialization and utility functions * for bootstrap epoch. * * After pmap_bootstrap() is called, the following functions for * mappings can be used: * * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); * void pmap_kremove(vm_offset_t va); * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, * int prot); * * NOTE: This is not SMP coherent stage. And physical page allocation is not * allowed during this stage. * *****************************************************************************/ /* * Initialize kernel PMAP locks and lists, kernel_pmap itself, and * reserve various virtual spaces for temporary mappings. */ void pmap_bootstrap(vm_offset_t firstaddr) { pt2_entry_t *unused __unused; struct pcpu *pc; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ kernel_pmap->pm_pt1 = kern_pt1; kernel_pmap->pm_pt2tab = kern_pt2tab; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) do { \ v = (c)pmap_preboot_reserve_pages(n); \ p = pt2map_entry((vm_offset_t)v); \ } while (0) /* * Local CMAP1/CMAP2 are used for zeroing and copying pages. * Local CMAP2 is also used for data cache cleaning. * Global CMAP3 is used for the idle process page zeroing. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); /* * _tmppt is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, _tmppt, 1); /* * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), * respectively. PADDR3 is used by pmap_pte2_ddb(). */ SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); #ifdef DDB SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); #endif mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); /* * Note that in very short time in initarm(), we are going to * initialize phys_avail[] array and no further page allocation * can happen after that until vm subsystem will be initialized. */ kernel_vm_end_new = kernel_vm_end; virtual_end = vm_max_kernel_address; } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap1_addr != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap1_pte2p = pt2map_entry(pages); pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); pc->pc_cmap1_addr = (caddr_t)pages; pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * The function can already be use in second initialization stage. * As such, the function DOES NOT call pmap_growkernel() where PT2 * allocation can happen. So if used, be sure that PT2 for given * virtual address is allocated already! * * Add a wired page to the kva. * Note: not SMP coherent. */ static __inline void pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, uint32_t attr) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ /* * This is a very low level function, so PT2 and particularly * PT2PG associated with given virtual address must be already * allocated. It's a pain mainly during pmap initialization * stage. However, called after pmap initialization with * virtual address not under kernel_vm_end will lead to * the same misery. */ if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) panic("%s: kernel PT2 not allocated!", __func__); } pte2p = pt2map_entry(va); pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); } PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (pte1_is_section(pte1_load(pte1p))) { pte1_clear(pte1p); } else { pte2p = pt2map_entry(va); pte2_clear(pte2p); } } /* * Share new kernel PT2PG with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) { pmap_t pmap; pt2_entry_t *pte2p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, npte2); } mtx_unlock_spin(&allpmaps_lock); } /* * Share new kernel PTE1 with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } mtx_unlock_spin(&allpmaps_lock); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * NOTE: Read the comments above pmap_kenter_prot_attr() as * the function is used herein! */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t pte1_offset; pt1_entry_t npte1; uint32_t l1prot, l2prot; uint32_t l1attr, l2attr; PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," " prot = %d\n", __func__, *virt, start, end, end - start, prot)); l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l1prot = ATTR_TO_L1(l2prot); l2attr = PTE2_ATTR_DEFAULT; l1attr = ATTR_TO_L1(l2attr); va = *virt; /* * Does the physical address range's size and alignment permit at * least one section mapping to be created? */ pte1_offset = start & PTE1_OFFSET; if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= PTE1_SIZE) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of section mappings. */ if ((va & PTE1_OFFSET) < pte1_offset) va = pte1_trunc(va) + pte1_offset; else if ((va & PTE1_OFFSET) > pte1_offset) va = pte1_roundup(va) + pte1_offset; } sva = va; while (start < end) { if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { KASSERT((va & PTE1_OFFSET) == 0, ("%s: misaligned va %#x", __func__, va)); npte1 = PTE1_KERN(start, l1prot, l1attr); pmap_kenter_pte1(va, npte1); va += PTE1_SIZE; start += PTE1_SIZE; } else { pmap_kenter_prot_attr(va, start, l2prot, l2attr); va += PAGE_SIZE; start += PAGE_SIZE; } } tlb_flush_range(sva, va - sva); *virt = va; return (sva); } /* * Make a temporary mapping for a physical address. * This is only intended to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); tlb_flush_local(va); return ((void *)crashdumpmap); } /************************************* * * TLB & cache maintenance routines. * *************************************/ /* * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_tlb_flush(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush(va); } PMAP_INLINE void pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush_range(sva, size); } /* * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PTE2_* bits * are ever set, PTE2_V in particular. * - Assumes we can write to pte2s without pte2_store() atomic ops. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PTE2_V. * - Assumes a vm_offset_t will fit in a pte2 (true for arm). * Because PTE2_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_pte2list_alloc(vm_offset_t *head) { pt2_entry_t *pte2p; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte2p = pt2map_entry(va); *head = *pte2p; if (*head & PTE2_V) panic("%s: va with PTE2_V set!", __func__); *pte2p = 0; return (va); } static void pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) { pt2_entry_t *pte2p; if (va & PTE2_V) panic("%s: freeing va with PTE2_V set!", __func__); pte2p = pt2map_entry(va); *pte2p = *head; /* virtual! PTE2_V is 0 though */ *head = va; } static void pmap_pte2list_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_pte2list_free(head, va); } } /***************************************************************************** * * PMAP third and final stage initialization. * * After pmap_init() is called, PMAP subsystem is fully initialized. * *****************************************************************************/ SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static u_long nkpt2pg = NKPT2PG; SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); static int sp_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &sp_enabled, 0, "Are large page mappings enabled?"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, "1MB page mapping counters"); static u_long pmap_pte1_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pte1_demotions, 0, "1MB page demotions"); static u_long pmap_pte1_mappings; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pte1_mappings, 0, "1MB page mappings"); static u_long pmap_pte1_p_failures; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pte1_p_failures, 0, "1MB page promotion failures"); static u_long pmap_pte1_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pte1_promotions, 0, "1MB page promotions"); static u_long pmap_pte1_kern_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); static u_long pmap_pte1_kern_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); static __inline ttb_entry_t pmap_ttb_get(pmap_t pmap) { return (vtophys(pmap->pm_pt1) | ttb_flags); } /* * Initialize a vm_page's machine-dependent fields. * * Variations: * 1. Pages for L2 page tables are always not managed. So, pv_list and * pt2_wirecount can share same physical space. However, proper * initialization on a page alloc for page tables and reinitialization * on the page free must be ensured. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); pt2_wirecount_init(m); m->md.pat_mode = VM_MEMATTR_DEFAULT; } /* * Virtualization for faster way how to zero whole page. */ static __inline void pagezero(void *page) { bzero(page, PAGE_SIZE); } /* * Zero L2 page table page. * Use same KVA as in pmap_zero_page(). */ static __inline vm_paddr_t pmap_pt2pg_zero(vm_page_t m) { pt2_entry_t *cmap2_pte2p; vm_paddr_t pa; struct pcpu *pc; pa = VM_PAGE_TO_PHYS(m); /* * XXX: For now, we map whole page even if it's already zero, * to sync it even if the sync is only DSB. */ sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); /* Even VM_ALLOC_ZERO request is only advisory. */ if ((m->flags & PG_ZERO) == 0) pagezero(pc->pc_cmap2_addr); pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); return (pa); } /* * Init just allocated page as L2 page table(s) holder * and return its physical address. */ static __inline vm_paddr_t pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_paddr_t pa; pt2_entry_t *pte2p; /* Check page attributes. */ if (m->md.pat_mode != pt_memattr) pmap_page_set_memattr(m, pt_memattr); /* Zero page and init wire counts. */ pa = pmap_pt2pg_zero(m); pt2_wirecount_init(m); /* * Map page to PT2MAP address space for given pmap. * Note that PT2MAP space is shared with all pmaps. */ if (pmap == kernel_pmap) pmap_kenter_pt2tab(va, PTE2_KPT(pa)); else { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, PTE2_KPT_NG(pa)); } return (pa); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; pt2_entry_t *pte2p, pte2; u_int i, pte1_idx, pv_npg; PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); /* * Initialize the vm page array entries for kernel pmap's * L2 page table pages allocated in advance. */ pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { vm_paddr_t pa; vm_page_t m; pte2 = pte2_load(pte2p); KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: L2 page table page is out of range", __func__)); m->pindex = pte1_idx; m->phys_addr = pa; pte1_idx += NPT2_IN_PG; } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); if (sp_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("%s: can't assign to pagesizes[1]", __func__)); pagesizes[1] = PTE1_SIZE; } /* * Calculate the size of the pv head table for sections. * Handle the possibility that "vm_phys_segs[...].end" is zero. * Note that the table is only for sections which could be promoted. */ first_managed_pa = pte1_trunc(vm_phys_segs[0].start); pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) - first_managed_pa) / PTE1_SIZE + 1; /* * Allocate memory for the pv head table for sections. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("%s: not enough kvm for pv chunks", __func__); pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { u_int anychanged; pt2_entry_t *epte2p, *pte2p, pte2; vm_page_t m; vm_paddr_t pa; anychanged = 0; pte2p = pt2map_entry(sva); epte2p = pte2p + count; while (pte2p < epte2p) { m = *ma++; pa = VM_PAGE_TO_PHYS(m); pte2 = pte2_load(pte2p); if ((pte2_pa(pte2) != pa) || (pte2_attr(pte2) != vm_page_pte2_attr(m))) { anychanged++; pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); } pte2p++; } if (__predict_false(anychanged)) tlb_flush_range(sva, count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } tlb_flush_range(sva, va - sva); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); } /* * If the given pmap is not the current or kernel pmap, the returned * pte2 must be released by passing it to pmap_pte2_release(). */ static pt2_entry_t * pmap_pte2(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Releases a pte2 that was obtained from pmap_pte2(). * Be prepared for the pte2p being NULL. */ static __inline void pmap_pte2_release(pt2_entry_t *pte2p) { if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { mtx_unlock(&PMAP2mutex); } } /* * Super fast pmap_pte2 routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire tlb flush for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt2_entry_t * pmap_pte2_quick(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t *pte2p; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) pa = pte1_pa(pte1) | (va & PTE1_OFFSET); else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); pmap_pte2_release(pte2p); } else pa = 0; PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa, lockpa; pt1_entry_t pte1; pt2_entry_t pte2, *pte2p; vm_page_t m; lockpa = 0; m = NULL; PMAP_LOCK(pmap); retry: pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) { if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) goto retry; m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (pte2_is_valid(pte2) && (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { pa = pte2_pa(pte2); if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) goto retry; m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } PA_UNLOCK_COND(lockpa); PMAP_UNLOCK(pmap); return (m); } /* * Grow the number of kernel L2 page table entries, if needed. */ void pmap_growkernel(vm_offset_t addr) { vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pt1_entry_t pte1; pt2_entry_t pte2; PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); /* * All the time kernel_vm_end is first KVA for which underlying * L2 page table is either not allocated or linked from L1 page table * (not considering sections). Except for two possible cases: * * (1) in the very beginning as long as pmap_growkernel() was * not called, it could be first unused KVA (which is not * rounded up to PTE1_SIZE), * * (2) when all KVA space is mapped and kernel_map->max_offset * address is not rounded up to PTE1_SIZE. (For example, * it could be 0xFFFFFFFF.) */ kernel_vm_end = pte1_roundup(kernel_vm_end); mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, PTE1_SIZE); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pte1 = pte1_load(kern_pte1(kernel_vm_end)); if (pte1_is_valid(pte1)) { kernel_vm_end += PTE1_SIZE; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * kernel_vm_end_new is used in pmap_pinit() when kernel * mappings are entered to new pmap all at once to avoid race * between pmap_kenter_pte1() and kernel_vm_end increase. * The same aplies to pmap_kenter_pt2tab(). */ kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into kernel PT2TAB. */ m = vm_page_alloc(NULL, pte1_index(kernel_vm_end) & ~PT2PG_MASK, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) panic("%s: no memory to grow kernel", __func__); /* * QQQ: To link all new L2 page tables from L1 page * table now and so pmap_kenter_pte1() them * at once together with pmap_kenter_pt2tab() * could be nice speed up. However, * pmap_growkernel() does not happen so often... * QQQ: The other TTBR is another option. */ pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, m); } else pt2pg_pa = pte2_pa(pte2); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); kernel_vm_end = kernel_vm_end_new; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = vm_max_kernel_address - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = vm_max_kernel_address - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /*********************************************** * * Pmap allocation/deallocation routines. * ***********************************************/ /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); PMAP_LOCK_INIT(pmap); /* * Kernel page table directory and pmap stuff around is already * initialized, we are using it right now and here. So, finish * only PMAP structures initialization for process0 ... * * Since the L1 page table and PT2TAB is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pt1 = kern_pt1; pmap->pm_pt2tab = kern_pt2tab; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); CPU_SET(0, &pmap->pm_active); } static __inline void pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pte1_index(sva); count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); bcopy(spte1p + idx, dpte1p + idx, count); } static __inline void pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pt2tab_index(sva); count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); bcopy(spte2p + idx, dpte2p + idx, count); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; vm_paddr_t pa, pt2tab_pa; u_int i; PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, pmap->pm_pt1)); /* * No need to allocate L2 page table space yet but we do need * a valid L1 page table and PT2TAB table. * * Install shared kernel mappings to these tables. It's a little * tricky as some parts of KVA are reserved for vectors, devices, * and whatever else. These parts are supposed to be above * vm_max_kernel_address. Thus two regions should be installed: * * (1) . * * QQQ: The second region should be stable enough to be installed * only once in time when the tables are allocated. * QQQ: Maybe copy of both regions at once could be faster ... * QQQ: Maybe the other TTBR is an option. * * Finally, install own PT2TAB table to these tables. */ if (pmap->pm_pt1 == NULL) { pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(kernel_arena, NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); if (pmap->pm_pt1 == NULL) return (0); } if (pmap->pm_pt2tab == NULL) { /* * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page * only, what should be the only size for 32 bit systems, * then we could allocate it with vm_page_alloc() and all * the stuff needed as other L2 page table pages. * (2) Note that a process PT2TAB is special L2 page table * page. Its mapping in kernel_arena is permanent and can * be used no matter which process is current. Its mapping * in PT2MAP can be used only for current process. */ pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(kernel_arena, NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); if (pmap->pm_pt2tab == NULL) { /* * QQQ: As struct pmap is allocated from UMA with * UMA_ZONE_NOFREE flag, it's important to leave * no allocation in pmap if initialization failed. */ kmem_free(kernel_arena, (vm_offset_t)pmap->pm_pt1, NB_IN_PT1); pmap->pm_pt1 = NULL; return (0); } /* * QQQ: Each L2 page table page vm_page_t has pindex set to * pte1 index of virtual address mapped by this page. * It's not valid for non kernel PT2TABs themselves. * The pindex of these pages can not be altered because * of the way how they are allocated now. However, it * should not be a problem. */ } mtx_lock_spin(&allpmaps_lock); /* * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), * kernel_vm_end_new is used here instead of kernel_vm_end. */ pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, kernel_vm_end_new - 1); pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 0xFFFFFFFF); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, kernel_vm_end_new - 1); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 0xFFFFFFFF); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. * I.e. self reference mapping. The PT2TAB is private, however mapped * into shared PT2MAP space, so the mapping should be not global. */ pt2tab_pa = vtophys(pmap->pm_pt2tab); pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* Insert PT2MAP PT2s into pmap PT1. */ pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Now synchronize new mapping which was made above. */ pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } #ifdef INVARIANTS static boolean_t pt2tab_user_is_empty(pt2_entry_t *tab) { u_int i, end; end = pt2tab_index(VM_MAXUSER_ADDRESS); for (i = 0; i < end; i++) if (tab[i] != 0) return (FALSE); return (TRUE); } #endif /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { #ifdef INVARIANTS vm_offset_t start, end; #endif KASSERT(pmap->pm_stats.resident_count == 0, ("%s: pmap resident count %ld != 0", __func__, pmap->pm_stats.resident_count)); KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), ("%s: has allocated user PT2(s)", __func__)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); #ifdef INVARIANTS start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); bzero((char *)pmap->pm_pt1 + start, end - start); start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); bzero((char *)pmap->pm_pt2tab + start, end - start); #endif /* * We are leaving PT1 and PT2TAB allocated on released pmap, * so hopefully UMA vmspace_zone will always be inited with * UMA_ZONE_NOFREE flag. */ } /********************************************************* * * L2 table pages and their pages management routines. * *********************************************************/ /* * Virtual interface for L2 page table wire counting. * * Each L2 page table in a page has own counter which counts a number of * valid mappings in a table. Global page counter counts mappings in all * tables in a page plus a single itself mapping in PT2TAB. * * During a promotion we leave the associated L2 page table counter * untouched, so the table (strictly speaking a page which holds it) * is never freed if promoted. * * If a page m->wire_count == 1 then no valid mappings exist in any L2 page * table in the page and the page itself is only mapped in PT2TAB. */ static __inline void pt2_wirecount_init(vm_page_t m) { u_int i; /* * Note: A page m is allocated with VM_ALLOC_WIRED flag and * m->wire_count should be already set correctly. * So, there is no need to set it again herein. */ for (i = 0; i < NPT2_IN_PG; i++) m->md.pt2_wirecount[i] = 0; } static __inline void pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) { /* * Note: A just modificated pte2 (i.e. already allocated) * is acquiring one extra reference which must be * explicitly cleared. It influences the KASSERTs herein. * All L2 page tables in a page always belong to the same * pmap, so we allow only one extra reference for the page. */ KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), ("%s: PT2 is overflowing ...", __func__)); KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowing ...", __func__)); m->wire_count++; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; } static __inline void pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) { KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, ("%s: PT2 is underflowing ...", __func__)); KASSERT(m->wire_count > 1, ("%s: PT2PG is underflowing ...", __func__)); m->wire_count--; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; } static __inline void pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) { KASSERT(count <= NPTE2_IN_PT2, ("%s: invalid count %u", __func__, count)); KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; m->wire_count += count; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); } static __inline uint32_t pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) { return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); } static __inline boolean_t pt2_is_empty(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); } static __inline boolean_t pt2_is_full(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == NPTE2_IN_PT2); } static __inline boolean_t pt2pg_is_empty(vm_page_t m) { return (m->wire_count == 1); } /* * This routine is called if the L2 page table * is not mapped correctly. */ static vm_page_t _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { uint32_t pte1_idx; pt1_entry_t *pte1p; pt2_entry_t pte2; vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pte1_idx = pte1_index(va); pte1p = pmap->pm_pt1 + pte1_idx; KASSERT(pte1_load(pte1p) == 0, ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, pte1_load(pte1p))); pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into pmap PT2TAB. */ m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, * the L2 page table page may have been allocated. */ return (NULL); } pmap->pm_stats.resident_count++; pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { pt2pg_pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pt2pg_pa); } pt2_wirecount_inc(m, pte1_idx); pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); return (m); } static vm_page_t pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { u_int pte1_idx; pt1_entry_t *pte1p, pte1; vm_page_t m; pte1_idx = pte1_index(va); retry: pte1p = pmap->pm_pt1 + pte1_idx; pte1 = pte1_load(pte1p); /* * This supports switching from a 1MB page to a * normal 4K page. */ if (pte1_is_section(pte1)) { (void)pmap_demote_pte1(pmap, pte1p, va); /* * Reload pte1 after demotion. * * Note: Demotion can even fail as either PT2 is not find for * the virtual address or PT2PG can not be allocated. */ pte1 = pte1_load(pte1p); } /* * If the L2 page table page is mapped, we just increment the * hold count, and activate it. */ if (pte1_is_link(pte1)) { m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(m, pte1_idx); } else { /* * Here if the PT2 isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte2(pmap, va, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused L2 page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) { /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ #ifdef PMAP_DEBUG pmap_zero_page_check(m); #endif m->flags |= PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Unwire L2 page tables page. */ static void pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt1_entry_t *pte1p, opte1 __unused; pt2_entry_t *pte2p; uint32_t i; KASSERT(pt2pg_is_empty(m), ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); /* * Unmap all L2 page tables in the page from L1 page table. * * QQQ: Individual L2 page tables (except the last one) can be unmapped * earlier. However, we are doing that this way. */ KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); pte1p = pmap->pm_pt1 + m->pindex; for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { KASSERT(m->md.pt2_wirecount[i] == 0, ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); opte1 = pte1_load(pte1p); if (pte1_is_link(opte1)) { pte1_clear(pte1p); /* * Flush intermediate TLB cache. */ pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); } #ifdef INVARIANTS else KASSERT((opte1 == 0) || pte1_is_section(opte1), ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, pmap, va, opte1, i)); #endif } /* * Unmap the page from PT2TAB. */ pte2p = pmap_pt2tab_entry(pmap, va); (void)pt2tab_load_clear(pte2p); pmap_tlb_flush(pmap, pt2map_pt2pg(va)); m->wire_count = 0; pmap->pm_stats.resident_count--; /* * This is a release store so that the ordinary store unmapping * the L2 page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); } /* * Decrements a L2 page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static __inline boolean_t pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { pt2_wirecount_dec(m, pte1_index(va)); if (pt2pg_is_empty(m)) { /* * QQQ: Wire count is zero, so whole page should be zero and * we can set PG_ZERO flag to it. * Note that when promotion is enabled, it takes some * more efforts. See pmap_unwire_pt2_all() below. */ pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); return (TRUE); } else return (FALSE); } /* * Drop a L2 page table page's wire count at once, which is used to record * the number of valid L2 page table entries within the page. If the wire * count drops to zero, then the L2 page table page is unmapped. */ static __inline void pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { u_int pte1_idx = pte1_index(va); KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, pt2_wirecount_get(m, pte1_idx))); /* * It's possible that the L2 page table was never used. * It happened in case that a section was created without promotion. */ if (pt2_is_full(m, va)) { pt2_wirecount_set(m, pte1_idx, 0); /* * QQQ: We clear L2 page table now, so when L2 page table page * is going to be freed, we can set it PG_ZERO flag ... * This function is called only on section mappings, so * hopefully it's not to big overload. * * XXX: If pmap is current, existing PT2MAP mapping could be * used for zeroing. */ pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); } #ifdef INVARIANTS else KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", __func__, pt2_wirecount_get(m, pte1_idx))); #endif if (pt2pg_is_empty(m)) { pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); } } /* * After removing a L2 page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static boolean_t pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt1_entry_t pte1; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (FALSE); pte1 = pte1_load(pmap_pte1(pmap, va)); mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); return (pmap_unwire_pt2(pmap, va, mpte, free)); } /************************************* * * Page management routines. * *************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * Is given page managed? */ static __inline bool is_managed(vm_paddr_t pa) { vm_page_t m; m = PHYS_TO_VM_PAGE(pa); if (m == NULL) return (false); return ((m->oflags & VPO_UNMANAGED) == 0); } static __inline bool pte1_is_managed(pt1_entry_t pte1) { return (is_managed(pte1_pa(pte1))); } static __inline bool pte2_is_managed(pt2_entry_t pte2) { return (is_managed(pte2_pa(pte2))); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pt1_entry_t *pte1p; pmap_t pmap; pt2_entry_t *pte2p, tpte2; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffs(inuse) - 1; pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) continue; pte2p = pmap_pte2(pmap, va); tpte2 = pte2_load(pte2p); if ((tpte2 & PTE2_W) == 0) tpte2 = pte2_load_clear(pte2p); pmap_pte2_release(pte2p); if ((tpte2 & PTE2_W) != 0) continue; KASSERT(tpte2 != 0, ("pmap_pv_reclaim: pmap %p va %#x zero pte", pmap, va)); pmap_tlb_flush(pmap, va); m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); if (pte2_is_dirty(tpte2)) vm_page_dirty(m); if ((tpte2 & PTE2_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt2(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); } /* * Free the pv_entry back to the free list. */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } /* * Get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffs(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the pte2list "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_pte2list_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } static void pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); /* * Transfer the 1mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pte1: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 1mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a section. */ static boolean_t pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } static inline void pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) { /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); else pmap_tlb_flush(pmap, pte1_trunc(va)); } /* * Update kernel pte1 on all pmaps. * * The following function is called only on one cpu with disabled interrupts. * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way * nobody can invoke explicit hardware table walk during the update of pte1. * Unsolicited hardware table walk can still happen, invoked by speculative * data or instruction prefetch or even by speculative hardware table walk. * * The break-before-make approach should be implemented here. However, it's * not so easy to do that for kernel mappings as it would be unhappy to unmap * itself unexpectedly but voluntarily. */ static void pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; /* * Get current pmap. Interrupts should be disabled here * so PCPU_GET() is done atomically. */ pmap = PCPU_GET(curpmap); if (pmap == NULL) pmap = kernel_pmap; /* * (1) Change pte1 on current pmap. * (2) Flush all obsolete TLB entries on current CPU. * (3) Change pte1 on all pmaps. * (4) Flush all obsolete TLB entries on all CPUs in SMP case. */ pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) { pmap_pte1_kern_promotions++; tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); } else { pmap_pte1_kern_demotions++; tlb_flush_local(pte1_trunc(va)); } /* * In SMP case, this function is called when all cpus are at smp * rendezvous, so there is no need to use 'allpmaps_lock' lock here. * In UP case, the function is called with this lock locked. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } #ifdef SMP /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) tlb_flush_range(pte1_trunc(va), PTE1_SIZE); else tlb_flush(pte1_trunc(va)); #endif } #ifdef SMP struct pte1_action { vm_offset_t va; pt1_entry_t npte1; u_int update; /* CPU that updates the PTE1 */ }; static void pmap_update_pte1_action(void *arg) { struct pte1_action *act = arg; if (act->update == PCPU_GET(cpuid)) pmap_update_pte1_kernel(act->va, act->npte1); } /* * Change pte1 on current pmap. * Note that kernel pte1 must be changed on all pmaps. * * According to the architecture reference manual published by ARM, * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. * According to this manual, UNPREDICTABLE behaviours must never happen in * a viable system. In contrast, on x86 processors, it is not specified which * TLB entry mapping the virtual address will be used, but the MMU doesn't * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone * Black). * * It's a problem when either promotion or demotion is being done. The pte1 * update and appropriate TLB flush must be done atomically in general. */ static void pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, pt1_entry_t npte1) { if (pmap == kernel_pmap) { struct pte1_action act; sched_pin(); act.va = va; act.npte1 = npte1; act.update = PCPU_GET(cpuid); - smp_rendezvous_cpus(all_cpus, smp_no_rendevous_barrier, + smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, pmap_update_pte1_action, NULL, &act); sched_unpin(); } else { register_t cspr; /* * Use break-before-make approach for changing userland * mappings. It can cause L1 translation aborts on other * cores in SMP case. So, special treatment is implemented * in pmap_fault(). To reduce the likelihood that another core * will be affected by the broken mapping, disable interrupts * until the mapping change is completed. */ cspr = disable_interrupts(PSR_I | PSR_F); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); restore_interrupts(cspr); } } #else static void pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, pt1_entry_t npte1) { if (pmap == kernel_pmap) { mtx_lock_spin(&allpmaps_lock); pmap_update_pte1_kernel(va, npte1); mtx_unlock_spin(&allpmaps_lock); } else { register_t cspr; /* * Use break-before-make approach for changing userland * mappings. It's absolutely safe in UP case when interrupts * are disabled. */ cspr = disable_interrupts(PSR_I | PSR_F); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); restore_interrupts(cspr); } } #endif #if VM_NRESERVLEVEL > 0 /* * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are * within a single page table page (PT2) to a single 1MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PTE1s are replicated in each pmap but * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only * read the PTE1 from the kernel pmap. */ static void pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t npte1; pt2_entry_t *fpte2p, fpte2, fpte2_fav; pt2_entry_t *pte2p, pte2; vm_offset_t pteva __unused; vm_page_t m __unused; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is * either invalid, unused, or does not map the first 4KB physical page * within a 1MB page. */ fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); fpte2 = pte2_load(fpte2p); if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != (PTE2_A | PTE2_V)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", __func__, va, pmap); return; } if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", __func__, va, pmap); return; } if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set without * a TLB invalidation. */ fpte2 |= PTE2_RO; pte2_store(fpte2p, fpte2); } /* * Examine each of the other PTE2s in the specified PT2. Abort if this * PTE2 maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE2. */ fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { pte2 = pte2_load(pte2p); if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", __func__, va, pmap); return; } if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set * without a TLB invalidation. See note above. */ pte2 |= PTE2_RO; pte2_store(pte2p, pte2); pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & PTE2_FRAME); CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", __func__, pteva, pmap); } if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", __func__, va, pmap); return; } fpte2_fav -= PTE2_SIZE; } /* * The page table page in its current state will stay in PT2TAB * until the PTE1 mapping the section is demoted by pmap_demote_pte1() * or destroyed by pmap_remove_pte1(). * * Note that L2 page table size is not equal to PAGE_SIZE. */ m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: PT2 page is out of range", __func__)); KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); /* * Get pte1 from pte2 format. */ npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; /* * Promote the pv entries. */ if (pte2_is_managed(fpte2)) pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); /* * Promote the mappings. */ pmap_change_pte1(pmap, pte1p, va, npte1); pmap_pte1_promotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Zero L2 page table page. */ static __inline void pmap_clear_pt2(pt2_entry_t *fpte2p) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) pte2_clear(pte2p); } /* * Removes a 1MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { vm_page_t m; uint32_t pte1_idx; pt2_entry_t *fpte2p; vm_paddr_t pt2_pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); m = pmap_pt2_page(pmap, va); if (m == NULL) /* * QQQ: Is this function called only on promoted pte1? * We certainly do section mappings directly * (without promotion) in kernel !!! */ panic("%s: missing pt2 page", __func__); pte1_idx = pte1_index(va); /* * Initialize the L2 page table. */ fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); pmap_clear_pt2(fpte2p); /* * Remove the mapping. */ pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); /* * QQQ: We do not need to invalidate PT2MAP mapping * as we did not change it. I.e. the L2 page table page * was and still is mapped the same way. */ } /* * Do the things to unmap a section in a process */ static void pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, struct spglist *free) { pt1_entry_t opte1; struct md_page *pvh; vm_offset_t eva, va; vm_page_t m; PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); /* * Clear and invalidate the mapping. It should occupy one and only TLB * entry. So, pmap_tlb_flush() called with aligned address should be * sufficient. */ opte1 = pte1_load_clear(pte1p); pmap_tlb_flush(pmap, sva); if (pte1_is_wired(opte1)) pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; if (pte1_is_managed(opte1)) { pvh = pa_to_pvh(pte1_pa(opte1)); pmap_pvh_free(pvh, pmap, sva); eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) { if (pte1_is_dirty(opte1)) vm_page_dirty(m); if (opte1 & PTE1_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { /* * L2 page table(s) can't be removed from kernel map as * kernel counts on it (stuff around pmap_growkernel()). */ pmap_remove_kernel_pte1(pmap, pte1p, sva); } else { /* * Get associated L2 page table page. * It's possible that the page was never allocated. */ m = pmap_pt2_page(pmap, sva); if (m != NULL) pmap_unwire_pt2_all(pmap, sva, m, free); } } /* * Fills L2 page table page with mappings to consecutive physical pages. */ static __inline void pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { pte2_store(pte2p, npte2); npte2 += PTE2_SIZE; } } /* * Tries to demote a 1MB page mapping. If demotion fails, the * 1MB page mapping is invalidated. */ static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t opte1, npte1; pt2_entry_t *fpte2p, npte2; vm_paddr_t pt2pg_pa, pt2_pa; vm_page_t m; struct spglist free; uint32_t pte1_idx, isnew = 0; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); opte1 = pte1_load(pte1p); KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { KASSERT(!pte1_is_wired(opte1), ("%s: PT2 page for a wired mapping is missing", __func__)); /* * Invalidate the 1MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); pmap_free_zero_pages(&free); CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", __func__, va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; isnew = 1; /* * We init all L2 page tables in the page even if * we are going to change everything for one L2 page * table in a while. */ pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { if (va < VM_MAXUSER_ADDRESS) { if (pt2_is_empty(m, va)) isnew = 1; /* Demoting section w/o promotion. */ #ifdef INVARIANTS else KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" " count %u", __func__, pt2_wirecount_get(m, pte1_index(va)))); #endif } } pt2pg_pa = VM_PAGE_TO_PHYS(m); pte1_idx = pte1_index(va); /* * If the pmap is current, then the PT2MAP can provide access to * the page table page (promoted L2 page tables are not unmapped). * Otherwise, temporarily map the L2 page table page (m) into * the kernel's address space at either PADDR1 or PADDR2. * * Note that L2 page table size is not equal to PAGE_SIZE. */ if (pmap_is_current(pmap)) fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); } else { mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); } pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); npte1 = PTE1_LINK(pt2_pa); KASSERT((opte1 & PTE1_A) != 0, ("%s: opte1 is missing PTE1_A", __func__)); KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, ("%s: opte1 has PTE1_NM", __func__)); /* * Get pte2 from pte1 format. */ npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; /* * If the L2 page table page is new, initialize it. If the mapping * has changed attributes, update the page table entries. */ if (isnew != 0) { pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); pmap_fill_pt2(fpte2p, npte2); } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != (npte2 & PTE2_PROMOTE)) pmap_fill_pt2(fpte2p, npte2); KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), ("%s: fpte2p and npte2 map different physical addresses", __func__)); if (fpte2p == PADDR2) mtx_unlock(&PMAP2mutex); /* * Demote the mapping. This pmap is locked. The old PTE1 has * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also * has not PTE1_NM set. Thus, there is no danger of a race with * another processor changing the setting of PTE1_A and/or PTE1_NM * between the read above and the store below. */ pmap_change_pte1(pmap, pte1p, va, npte1); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_pv_reclaim(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PTE1 must have already changed from mapping * the 1mpage to referencing the page table page. */ if (pte1_is_managed(opte1)) pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); pmap_pte1_demotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); return (TRUE); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pt2_entry_t npte2, opte2; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte2, om; boolean_t wired; va = trunc_page(va); mpte2 = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte2 = pmap_allocpte2(pmap, va, flags); if (mpte2 == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte2 failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) panic("%s: attempted on 1MB page", __func__); pte2p = pmap_pte2_quick(pmap, va); if (pte2p == NULL) panic("%s: invalid L1 page table entry va=%#x", __func__, va); om = NULL; pa = VM_PAGE_TO_PHYS(m); opte2 = pte2_load(pte2p); opa = pte2_pa(opte2); /* * Mapping has not changed, must be protection or wiring change. */ if (pte2_is_valid(opte2) && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT2 pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT2 page will be also. */ if (wired && !pte2_is_wired(opte2)) pmap->pm_stats.wired_count++; else if (!wired && pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; /* * Remove extra pte2 reference */ if (mpte2) pt2_wirecount_dec(mpte2, pte1_index(va)); if (pte2_is_managed(opte2)) om = m; goto validate; } /* * QQQ: We think that changing physical address on writeable mapping * is not safe. Well, maybe on kernel address space with correct * locking, it can make a sense. However, we have no idea why * anyone should do that on user address space. Are we wrong? */ KASSERT((opa == 0) || (opa == pa) || !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", __func__, pmap, va, opte2, opa, pa, flags, prot)); pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (pte2_is_managed(opte2)) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } /* * Remove extra pte2 reference */ if (mpte2 != NULL) pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("%s: managed mapping within the clean submap", __func__)); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte2 = PTE2(pa, PTE2_NM, vm_page_pte2_attr(m)); if (prot & VM_PROT_WRITE) { if (pte2_is_managed(npte2)) vm_page_aflag_set(m, PGA_WRITEABLE); } else npte2 |= PTE2_RO; if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; if (wired) npte2 |= PTE2_W; if (va < VM_MAXUSER_ADDRESS) npte2 |= PTE2_U; if (pmap != kernel_pmap) npte2 |= PTE2_NG; /* * If the mapping or permission bits are different, we need * to update the pte2. * * QQQ: Think again and again what to do * if the mapping is going to be changed! */ if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, there is a race * for other threads of current process in lazy loading case. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). * * QQQ: (1) Does it exist any better way where * or how to sync icache? * (2) Now, we do it on a page basis. */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pat_mode == VM_MEMATTR_WB_WA && (opa != pa || (opte2 & PTE2_NX))) cache_icache_sync_fresh(va, pa, PAGE_SIZE); npte2 |= PTE2_A; if (flags & VM_PROT_WRITE) npte2 &= ~PTE2_NM; if (opte2 & PTE2_V) { /* Change mapping with break-before-make approach. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); pte2_store(pte2p, npte2); if (opte2 & PTE2_A) { if (pte2_is_managed(opte2)) vm_page_aflag_set(om, PGA_REFERENCED); } if (pte2_is_dirty(opte2)) { if (pte2_is_managed(opte2)) vm_page_dirty(om); } if (pte2_is_managed(opte2) && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else pte2_store(pte2p, npte2); } #if 0 else { /* * QQQ: In time when both access and not mofified bits are * emulated by software, this should not happen. Some * analysis is need, if this really happen. Missing * tlb flush somewhere could be the reason. */ panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, va, opte2, npte2); } #endif #if VM_NRESERVLEVEL > 0 /* * If both the L2 page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pte1(pmap, pte1p, va); #endif sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Do the things to unmap a page in a process. */ static int pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, struct spglist *free) { pt2_entry_t opte2; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Clear and invalidate the mapping. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", __func__, pmap, va, opte2)); if (opte2 & PTE2_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte2_is_managed(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt2(pmap, va, free)); } /* * Remove a single page from a process address space. */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt2_entry_t *pte2p; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || !pte2_is_valid(pte2_load(pte2p))) return; pmap_remove_pte2(pmap, pte2p, va, free); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * Special handling of removing one page. A very common * operation and easy to short circuit some code. */ if (sva + PAGE_SIZE == eva) { pte1 = pte1_load(pmap_pte1(pmap, sva)); if (pte1_is_link(pte1)) { pmap_remove_page(pmap, sva, &free); goto out; } } for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; if (pmap->pm_stats.resident_count == 0) break; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that the L1 page * table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_remove_pte1(pmap, pte1p, sva, &free); continue; } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* The large page mapping was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion. */ pte1 = pte1_load(pte1p); } #endif } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being removed. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (pmap_remove_pte2(pmap, pte2p, sva, &free)) break; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt2_entry_t *pte2p, opte2; pt1_entry_t *pte1p; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " "a 1mpage in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, pv->pv_va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", __func__, pmap, pv->pv_va)); if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_unuse_pt2(pmap, pv->pv_va, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * Just subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m, mt, mpt2pg; struct md_page *pvh; pa = pte1_pa(pte1); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte1 %#x", __func__, pte1)); if (pte1_is_dirty(pte1)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; pvh = pa_to_pvh(pa); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpt2pg = pmap_pt2_page(pmap, pv->pv_va); if (mpt2pg != NULL) pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); } /* * Just subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m; struct md_page *pvh; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte2 %#x", __func__, pte2)); if (pte2_is_dirty(pte2)) vm_page_dirty(m); pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(pa); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt2(pmap, pv->pv_va, free); } /* * Remove all pages from specified address space this aids process * exit speeds. Also, this code is special cased for current process * only, but can have the more generic (and slightly slower) mode enabled. * This is much faster than pmap_remove in the case of running down * an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; pv_entry_t pv; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; boolean_t allfree; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), ("%s: non-current pmap %p", __func__, pmap)); #if defined(SMP) && defined(INVARIANTS) { cpuset_t other_cpus; sched_pin(); other_cpus = pmap->pm_active; CPU_CLR(PCPU_GET(cpuid), &other_cpus); sched_unpin(); KASSERT(CPU_EMPTY(&other_cpus), ("%s: pmap %p active on other cpus", __func__, pmap)); } #endif SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", __func__, pmap, pc->pc_pmap)); allfree = TRUE; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = ffs(inuse) - 1; bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; /* * Note that we cannot remove wired pages * from a process' mapping at this time */ pte1p = pmap_pte1(pmap, pv->pv_va); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) { allfree = FALSE; continue; } pte1_clear(pte1p); pmap_remove_pte1_quick(pmap, pte1, pv, &free); } else if (pte1_is_link(pte1)) { pte2p = pt2map_entry(pv->pv_va); pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) { printf("%s: pmap %p va %#x " "pte2 %#x\n", __func__, pmap, pv->pv_va, pte2); panic("bad pte2"); } if (pte2_is_wired(pte2)) { allfree = FALSE; continue; } pte2_clear(pte2p); pmap_remove_pte2_quick(pmap, pte2, pv, &free); } else { printf("%s: pmap %p va %#x pte1 %#x\n", __func__, pmap, pv->pv_va, pte1); panic("bad pte1"); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } tlb_flush_all_ng_local(); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * This code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No L2 page table pages. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpt2pg) { pt2_entry_t *pte2p, pte2; vm_paddr_t pa; struct spglist free; uint32_t l2prot; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("%s: managed mapping within the clean submap", __func__)); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a L2 page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int pte1_idx; pt1_entry_t pte1, *pte1p; vm_paddr_t pt2_pa; /* * Get L1 page table things. */ pte1_idx = pte1_index(va); pte1p = pmap_pte1(pmap, va); pte1 = pte1_load(pte1p); if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { /* * Each of NPT2_IN_PG L2 page tables on the page can * come here. Make sure that associated L1 page table * link is established. * * QQQ: It comes that we don't establish all links to * L2 page tables for newly allocated L2 page * tables page. */ KASSERT(!pte1_is_section(pte1), ("%s: pte1 %#x is section", __func__, pte1)); if (!pte1_is_link(pte1)) { pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } pt2_wirecount_inc(mpt2pg, pte1_idx); } else { /* * If the L2 page table page is mapped, we just * increment the hold count, and activate it. */ if (pte1_is_section(pte1)) { return (NULL); } else if (pte1_is_link(pte1)) { mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(mpt2pg, pte1_idx); } else { mpt2pg = _pmap_allocpte2(pmap, va, PMAP_ENTER_NOSLEEP); if (mpt2pg == NULL) return (NULL); } } } else { mpt2pg = NULL; } /* * This call to pt2map_entry() makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte2_quick(). * But that isn't as quick as pt2map_entry(). */ pte2p = pt2map_entry(va); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { if (mpt2pg != NULL) { /* * Remove extra pte2 reference */ pt2_wirecount_dec(mpt2pg, pte1_index(va)); mpt2pg = NULL; } return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpt2pg != NULL) { SLIST_INIT(&free); if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { pmap_tlb_flush(pmap, va); pmap_free_zero_pages(&free); } mpt2pg = NULL; } return (NULL); } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ pa = VM_PAGE_TO_PHYS(m); l2prot = PTE2_RO | PTE2_NM; if (va < VM_MAXUSER_ADDRESS) l2prot |= PTE2_U | PTE2_NG; if ((prot & VM_PROT_EXECUTE) == 0) l2prot |= PTE2_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ cache_icache_sync_fresh(va, pa, PAGE_SIZE); } pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); return (mpt2pg); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Tries to create 1MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pte1(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pt1_entry_t *pte1p; vm_paddr_t pa; uint32_t l1prot; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pte1p = pmap_pte1(pmap, va); if (pte1_is_valid(pte1_load(pte1p))) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (FALSE); } if ((m->oflags & VPO_UNMANAGED) == 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pte1(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (FALSE); } } /* * Increment counters. */ pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; /* * Map the section. * * QQQ: Why VM_PROT_WRITE is not evaluated and the mapping is * made readonly? */ pa = VM_PAGE_TO_PHYS(m); l1prot = PTE1_RO | PTE1_NM; if (va < VM_MAXUSER_ADDRESS) l1prot |= PTE1_U | PTE1_NG; if ((prot & VM_PROT_EXECUTE) == 0) l1prot |= PTE1_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ cache_icache_sync_fresh(va, pa, PTE1_SIZE); } pte1_store(pte1p, PTE1(pa, l1prot, ATTR_TO_L1(vm_page_pte2_attr(m)))); pmap_pte1_mappings++; CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpt2pg; vm_pindex_t diff, psize; PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", __func__, pmap, start, end, m_start, prot)); VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpt2pg = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && m->psind == 1 && sp_enabled && pmap_enter_pte1(pmap, va, m, prot)) m = &m[PTE1_SIZE / PAGE_SIZE - 1]; else mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, mpt2pg); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pt1_entry_t *pte1p; vm_paddr_t pa, pte2_pa; vm_page_t p; vm_memattr_t pat_mode; u_int l1attr, l1prot; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("%s: non-device object", __func__)); if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 1MB page boundary. */ pte2_pa = VM_PAGE_TO_PHYS(p); if (pte2_pa & PTE1_OFFSET) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 1MB pages. * * QQQ: Well, we are mapping a section, so same condition must * be hold like during promotion. It looks that only RW mapping * is done here, so readonly mapping must be done elsewhere. */ l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); PMAP_LOCK(pmap); for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { pte1p = pmap_pte1(pmap, addr); if (!pte1_is_valid(pte1_load(pte1p))) { pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } /* Else continue on if the PTE1 is already valid. */ addr += PTE1_SIZE; } PMAP_UNLOCK(pmap); } } /* * Do the things to protect a 1mpage in a process. */ static void pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, vm_prot_t prot) { pt1_entry_t npte1, opte1; vm_offset_t eva, va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); opte1 = npte1 = pte1_load(pte1p); if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) npte1 |= PTE1_RO | PTE1_NM; if ((prot & VM_PROT_EXECUTE) == 0) npte1 |= PTE1_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte1 != opte1) { pte1_store(pte1p, npte1); pmap_tlb_flush(pmap, sva); } } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { boolean_t pv_lists_locked; vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, opte2, npte2; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_protect_pte1(pmap, pte1p, sva, prot); continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping * was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { vm_page_t m; opte2 = npte2 = pte2_load(pte2p); if (!pte2_is_valid(opte2)) continue; if ((prot & VM_PROT_WRITE) == 0) { if (pte2_is_managed(opte2) && pte2_is_dirty(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); vm_page_dirty(m); } npte2 |= PTE2_RO | PTE2_NM; } if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte2 != opte2) { pte2_store(pte2p, npte2); pmap_tlb_flush(pmap, sva); } } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt1_entry_t pte1; pt2_entry_t pte2; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) count++; } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); if (pte2_is_wired(pte2)) count++; } PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 1mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = pte1_is_dirty(pte1); } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = pte2_is_dirty(pte2); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTE2s can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt1_entry_t pte1; pt2_entry_t pte2; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, addr)); if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(addr)); rv = !pte2_is_valid(pte2) ; } PMAP_UNLOCK(pmap); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 1mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); } else { pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); opte1 = pte1_load(pte1p); if (pte1_is_dirty(opte1)) { /* * Although "opte1" is mapping a 1MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((opte1 & PTE1_A) != 0) { /* * Since this reference bit is shared by 256 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual section number, and the pmap * address to select one 4KB page out of the 256 * on which testing the reference bit will result * in clearing that bit. This function is designed * to avoid the selection of the same 4KB page * for every 1MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the section is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && !pte1_is_wired(opte1)) { pte1_clear_bit(pte1p, PTE1_A); pmap_tlb_flush(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(pte1_is_link(pte1_load(pte1p)), ("%s: not found a link in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load(pte2p); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if ((opte2 & PTE2_A) != 0) { pte2_clear_bit(pte2p, PTE2_A); pmap_tlb_flush(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { if (!pte1_is_wired(pte1)) panic("%s: pte1 %#x not wired", __func__, pte1); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pte1_clear_bit(pte1p, PTE1_W); pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) panic("%s: demotion failed", __func__); #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (!pte2_is_wired(pte2)) panic("%s: pte2 %#x is missing PTE2_W", __func__, pte2); /* * PTE2_W must be cleared atomically. Although the pmap * lock synchronizes access to PTE2_W, another processor * could be changing PTE2_NM and/or PTE2_A concurrently. */ pte2_clear_bit(pte2p, PTE2_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); if (!(pte1_load(pte1p) & PTE1_RO)) (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load(pte2p); if (!(opte2 & PTE2_RO)) { pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, pte2; vm_offset_t pdnxt; vm_page_t m; boolean_t pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = pte1_trunc(sva + PTE1_SIZE); if (pdnxt < sva) pdnxt = eva; pte1p = pmap_pte1(pmap, sva); opte1 = pte1_load(pte1p); if (!pte1_is_valid(opte1)) /* XXX */ continue; else if (pte1_is_section(opte1)) { if (!pte1_is_managed(opte1)) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying L2 page * table is fully populated, this removal never * frees a L2 page table page. */ if (!pte1_is_wired(opte1)) { pte2p = pmap_pte2_quick(pmap, sva); KASSERT(pte2_is_valid(pte2_load(pte2p)), ("%s: invalid PTE2", __func__)); pmap_remove_pte2(pmap, pte2p, sva, NULL); } } if (pdnxt > eva) pdnxt = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) continue; else if (pte2_is_dirty(pte2)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); vm_page_dirty(m); } pte2_set_bit(pte2p, PTE2_NM); pte2_clear_bit(pte2p, PTE2_A); } else if ((pte2 & PTE2_A) != 0) pte2_clear_bit(pte2p, PTE2_A); else continue; pmap_tlb_flush(pmap, sva); } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("%s: page %p is exclusive busy", __func__, m)); /* * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM * cleared. If the object containing the page is locked and the page * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently * set. */ if ((m->flags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); opte1 = pte1_load(pte1p); if (!(opte1 & PTE1_RO)) { if (pmap_demote_pte1(pmap, pte1p, va) && !pte1_is_wired(opte1)) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); pte2p = pmap_pte2_quick(pmap, va); opte2 = pte2_load(pte2p); if ((opte2 & PTE2_V)) { pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); vm_page_dirty(m); pmap_tlb_flush(pmap, va); } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); if (pte2_is_dirty(pte2_load(pte2p))) { pte2_set_bit(pte2p, PTE2_NM); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { pt2_entry_t *cmap2_pte2p; vm_memattr_t oma; vm_paddr_t pa; struct pcpu *pc; oma = m->md.pat_mode; m->md.pat_mode = ma; CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, VM_PAGE_TO_PHYS(m), oma, ma); if ((m->flags & PG_FICTITIOUS) != 0) return; #if 0 /* * If "m" is a normal page, flush it from the cache. * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m, oma)) return; #endif /* * If page is not mapped by sf buffer, map the page * transient and do invalidation. */ if (ma != oma) { pa = VM_PAGE_TO_PHYS(m); sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_memattr_to_pte2(ma))); dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } } /* * Miscellaneous support routines follow */ /* * Returns TRUE if the given page is mapped individually or as part of * a 1mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(pc->pc_cmap2_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap2_addr); else bzero(pc->pc_cmap2_addr + off, size); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (pte2_load(CMAP3) != 0) panic("%s: CMAP3 busy", __func__); sched_pin(); pte2_store(CMAP3, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(CADDR3); pte2_clear(CMAP3); tlb_flush((vm_offset_t)CADDR3); sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap1_pte2p = pc->pc_cmap1_pte2p; cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap1_pte2p) != 0) panic("%s: CMAP1 busy", __func__); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), PTE2_AP_KRW, vm_page_pte2_attr(dst))); bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); pte2_clear(cmap1_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap1_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap1_pte2p = pc->pc_cmap1_pte2p; cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap1_pte2p) != 0) panic("pmap_copy_pages: CMAP1 busy"); if (pte2_load(cmap2_pte2p) != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); a_cp = pc->pc_cmap1_addr + a_pg_offset; b_cp = pc->pc_cmap2_addr + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } pte2_clear(cmap1_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap1_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } vm_offset_t pmap_quick_enter_page(vm_page_t m) { struct pcpu *pc; pt2_entry_t *pte2p; critical_enter(); pc = get_pcpu(); pte2p = pc->pc_qmap_pte2p; KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); return (pc->pc_qmap_addr); } void pmap_quick_remove_page(vm_offset_t addr) { struct pcpu *pc; pt2_entry_t *pte2p; pc = get_pcpu(); pte2p = pc->pc_qmap_pte2p; KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); pte2_clear(pte2p); tlb_flush(pc->pc_qmap_addr); critical_exit(); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t nextva; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = nextva) { pt2_entry_t *src_pte2p, *dst_pte2p; vm_page_t dst_mpt2pg, src_mpt2pg; pt1_entry_t src_pte1; u_int pte1_idx; KASSERT(addr < VM_MAXUSER_ADDRESS, ("%s: invalid to pmap_copy page tables", __func__)); nextva = pte1_trunc(addr + PTE1_SIZE); if (nextva < addr) nextva = end_addr; pte1_idx = pte1_index(addr); src_pte1 = src_pmap->pm_pt1[pte1_idx]; if (pte1_is_section(src_pte1)) { if ((addr & PTE1_OFFSET) != 0 || (addr + PTE1_SIZE) > end_addr) continue; if (dst_pmap->pm_pt1[pte1_idx] == 0 && (!pte1_is_managed(src_pte1) || pmap_pv_insert_pte1(dst_pmap, addr, pte1_pa(src_pte1)))) { dst_pmap->pm_pt1[pte1_idx] = src_pte1 & ~PTE1_W; dst_pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } continue; } else if (!pte1_is_link(src_pte1)) continue; src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); /* * We leave PT2s to be linked from PT1 even if they are not * referenced until all PT2s in a page are without reference. * * QQQ: It could be changed ... */ #if 0 /* single_pt2_link_is_cleared */ KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, ("%s: source page table page is unused", __func__)); #else if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) continue; #endif if (nextva > end_addr) nextva = end_addr; src_pte2p = pt2map_entry(addr); while (addr < nextva) { pt2_entry_t temp_pte2; temp_pte2 = pte2_load(src_pte2p); /* * we only virtual copy managed pages */ if (pte2_is_managed(temp_pte2)) { dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dst_mpt2pg == NULL) goto out; dst_pte2p = pmap_pte2_quick(dst_pmap, addr); if (!pte2_is_valid(pte2_load(dst_pte2p)) && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ temp_pte2 &= ~(PTE2_W | PTE2_A); temp_pte2 |= PTE2_NM; pte2_store(dst_pte2p, temp_pte2); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_pt2(dst_pmap, addr, dst_mpt2pg, &free)) { pmap_tlb_flush(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= pt2_wirecount_get(src_mpt2pg, pte1_idx)) break; } addr += PAGE_SIZE; src_pte2p++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more section mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t pte1_offset; if (size < PTE1_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); pte1_offset = offset & PTE1_OFFSET; if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || (*addr & PTE1_OFFSET) == pte1_offset) return; if ((*addr & PTE1_OFFSET) < pte1_offset) *addr = pte1_trunc(*addr) + pte1_offset; else *addr = pte1_roundup(*addr) + pte1_offset; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid, ttb; PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif ttb = pmap_ttb_get(pmap); /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_pagedir = ttb; cp15_ttbr_set(ttb); PCPU_SET(curpmap, pmap); critical_exit(); } /* * Perform the pmap work for mincore. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; vm_paddr_t pa; bool managed; int val; PMAP_LOCK(pmap); retry: pte1p = pmap_pte1(pmap, addr); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); managed = pte1_is_managed(pte1); val = MINCORE_SUPER | MINCORE_INCORE; if (pte1_is_dirty(pte1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte1 & PTE1_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, addr); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); pa = pte2_pa(pte2); managed = pte2_is_managed(pte2); val = MINCORE_INCORE; if (pte2_is_dirty(pte2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte2 & PTE2_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else { managed = false; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { vm_offset_t sva; uint32_t l2attr; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); while (size != 0) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { vm_offset_t sva; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; while (size != 0) { pmap_kremove(va); va += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) { pcb->pcb_pagedir = pmap_ttb_get(pmap); } /* * Clean L1 data cache range by physical address. * The range must be within a single page. */ static void pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, ("%s: not on single page", __func__)); sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Sync instruction cache range which is not mapped yet. */ void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { uint32_t len, offset; vm_page_t m; /* Write back d-cache on given address range. */ offset = pa & PAGE_MASK; for ( ; size != 0; size -= len, pa += len, offset = 0) { len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } /* * I-cache is VIPT. Only way how to flush all virtual mappings * on given physical address is to invalidate all i-cache. */ icache_inv_all(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) { /* Write back d-cache on given address range. */ if (va >= VM_MIN_KERNEL_ADDRESS) { dcache_wb_pou(va, size); } else { uint32_t len, offset; vm_paddr_t pa; vm_page_t m; offset = va & PAGE_MASK; for ( ; size != 0; size -= len, va += len, offset = 0) { pa = pmap_extract(pmap, va); /* offset is preserved */ len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } } /* * I-cache is VIPT. Only way how to flush all virtual mappings * on given physical address is to invalidate all i-cache. */ icache_inv_all(); } /* * The implementation of pmap_fault() uses IN_RANGE2() macro which * depends on the fact that given range size is a power of 2. */ CTASSERT(powerof2(NB_IN_PT1)); CTASSERT(powerof2(PT2MAP_SIZE)); #define IN_RANGE2(addr, start, size) \ ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) /* * Handle access and R/W emulation faults. */ int pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; if (pmap == NULL) pmap = kernel_pmap; /* * In kernel, we should never get abort with FAR which is in range of * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here * and print out a useful abort message and even get to the debugger * otherwise it likely ends with never ending loop of aborts. */ if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { /* * All L1 tables should always be mapped and present. * However, we check only current one herein. For user mode, * only permission abort from malicious user is not fatal. * And alignment abort as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", __func__, pmap, pmap->pm_pt1, far); panic("%s: pm_pt1 abort", __func__); } return (KERN_INVALID_ADDRESS); } if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { /* * PT2MAP should be always mapped and present in current * L1 table. However, only existing L2 tables are mapped * in PT2MAP. For user mode, only L2 translation abort and * permission abort from malicious user is not fatal. * And alignment abort as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", __func__, pmap, PT2MAP, far); panic("%s: PT2MAP abort", __func__); } return (KERN_INVALID_ADDRESS); } /* * A pmap lock is used below for handling of access and R/W emulation * aborts. They were handled by atomic operations before so some * analysis of new situation is needed to answer the following question: * Is it safe to use the lock even for these aborts? * * There may happen two cases in general: * * (1) Aborts while the pmap lock is locked already - this should not * happen as pmap lock is not recursive. However, under pmap lock only * internal kernel data should be accessed and such data should be * mapped with A bit set and NM bit cleared. If double abort happens, * then a mapping of data which has caused it must be fixed. Further, * all new mappings are always made with A bit set and the bit can be * cleared only on managed mappings. * * (2) Aborts while another lock(s) is/are locked - this already can * happen. However, there is no difference here if it's either access or * R/W emulation abort, or if it's some other abort. */ PMAP_LOCK(pmap); #ifdef SMP /* * Special treatment is due to break-before-make approach done when * pte1 is updated for userland mapping during section promotion or * demotion. If not caught here, pmap_enter() can find a section * mapping on faulting address. That is not allowed. */ if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } #endif /* * Accesss bits for page and section. Note that the entry * is not in TLB yet, so TLB flush is not necessary. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if (idx == FAULT_ACCESS_L2) { pte2p = pt2map_entry(far); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { pte2_store(pte2p, pte2 | PTE2_A); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if (idx == FAULT_ACCESS_L1) { pte1p = pmap_pte1(pmap, far); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pte1_store(pte1p, pte1 | PTE1_A); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * Handle modify bits for page and section. Note that the modify * bit is emulated by software. So PTEx_RO is software read only * bit and PTEx_NM flag is real hardware read only bit. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { pte2p = pt2map_entry(far); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && (pte2 & PTE2_NM)) { pte2_store(pte2p, pte2 & ~PTE2_NM); tlb_flush(trunc_page(far)); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { pte1p = pmap_pte1(pmap, far); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1) && !(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { pte1_store(pte1p, pte1 & ~PTE1_NM); tlb_flush(pte1_trunc(far)); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * QQQ: The previous code, mainly fast handling of access and * modify bits aborts, could be moved to ASM. Now we are * starting to deal with not fast aborts. */ #ifdef INVARIANTS /* * Read an entry in PT2TAB associated with both pmap and far. * It's safe because PT2TAB is always mapped. */ pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); if (pte2_is_valid(pte2)) { /* * Now, when we know that L2 page table is allocated, * we can use PT2MAP to get L2 page table entry. */ pte2 = pte2_load(pt2map_entry(far)); if (pte2_is_valid(pte2)) { /* * If L2 page table entry is valid, make sure that * L1 page table entry is valid too. Note that we * leave L2 page entries untouched when promoted. */ pte1 = pte1_load(pmap_pte1(pmap, far)); if (!pte1_is_valid(pte1)) { panic("%s: missing L1 page entry (%p, %#x)", __func__, pmap, far); } } } #endif PMAP_UNLOCK(pmap); return (KERN_FAILURE); } #if defined(PMAP_DEBUG) /* * Reusing of KVA used in pmap_zero_page function !!! */ static void pmap_zero_page_check(vm_page_t m) { pt2_entry_t *cmap2_pte2p; uint32_t *p, *end; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) if (*p != 0) panic("%s: page %p not zero, va: %p", __func__, m, pc->pc_cmap2_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte2 = 0; int i, j, index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid || p->p_vmspace == NULL) continue; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPTE1_IN_PT1; i++) { pt1_entry_t pte1; pt2_entry_t *pte2p, pte2; vm_offset_t base, va; vm_paddr_t pa; vm_page_t m; base = i << PTE1_SHIFT; pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1_is_section(pte1)) { /* * QQQ: Do something here! */ } else if (pte1_is_link(pte1)) { for (j = 0; j < NPTE2_IN_PT2; j++) { va = base + (j << PAGE_SHIFT); if (va >= VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte2); } pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (!pte2_is_valid(pte2)) continue; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pa: 0x%x, h: %d, w:" " %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte2++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } sx_sunlock(&allproc_lock); return (npte2); } #endif #ifdef DDB static pt2_entry_t * pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (!pte1_is_link(pte1)) return (NULL); if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR3); } #ifdef SMP else if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR3); } #endif return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } static void dump_pmap(pmap_t pmap) { printf("pmap %p\n", pmap); printf(" pm_pt1: %p\n", pmap->pm_pt1); printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); } DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) { pmap_t pmap; LIST_FOREACH(pmap, &allpmaps, pm_list) { dump_pmap(pmap); } } static int pte2_class(pt2_entry_t pte2) { int cls; cls = (pte2 >> 2) & 0x03; cls |= (pte2 >> 4) & 0x04; return (cls); } static void dump_section(pmap_t pmap, uint32_t pte1_idx) { } static void dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) { uint32_t i; vm_offset_t va; pt2_entry_t *pte2p, pte2; vm_page_t m; va = pte1_idx << PTE1_SHIFT; pte2p = pmap_pte2_ddb(pmap, va); for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (pte2 == 0) continue; if (!pte2_is_valid(pte2)) { printf(" 0x%08X: 0x%08X", va, pte2); if (!invalid_ok) printf(" - not valid !!!"); printf("\n"); continue; } m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); if (m != NULL) { printf(" v:%d h:%d w:%d f:0x%04X\n", m->valid, m->hold_count, m->wire_count, m->flags); } else { printf("\n"); } } } static __inline boolean_t is_pv_chunk_space(vm_offset_t va) { if ((((vm_offset_t)pv_chunkbase) <= va) && (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) return (TRUE); return (FALSE); } DB_SHOW_COMMAND(pmap, pmap_pmap_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va, eva; vm_page_t m; uint32_t i; boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; if (have_addr) { pmap_t pm; LIST_FOREACH(pm, &allpmaps, pm_list) if (pm == pmap) break; if (pm == NULL) { printf("given pmap %p is not in allpmaps list\n", pmap); return; } } else pmap = PCPU_GET(curpmap); eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ printf("pmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); for(i = 0; i < NPTE1_IN_PT1; i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (va >= eva) break; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { dump_link_ok = TRUE; invalid_ok = FALSE; pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", va, pte1, pte2, m); if (is_pv_chunk_space(va)) { printf(" - pv_chunk space"); if (dump_pv_chunk) invalid_ok = TRUE; else dump_link_ok = FALSE; } else if (m != NULL) printf(" w:%d w2:%u", m->wire_count, pt2_wirecount_get(m, pte1_index(va))); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO"); else if (pte2_pa(pte1) != pte2_pa(pte2)) printf(" !!! pt2tab entry is DIFFERENT - m: %p", PHYS_TO_VM_PAGE(pte2_pa(pte2))); printf("\n"); if (dump_link_ok) dump_link(pmap, i, invalid_ok); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } } static void dump_pt2tab(pmap_t pmap) { uint32_t i; pt2_entry_t pte2; vm_offset_t va; vm_paddr_t pa; vm_page_t m; printf("PT2TAB:\n"); for (i = 0; i < PT2TAB_ENTRIES; i++) { pte2 = pte2_load(&pmap->pm_pt2tab[i]); if (!pte2_is_valid(pte2)) continue; va = i << PT2TAB_SHIFT; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, pte2_class(pte2), !!(pte2 & PTE2_S), m); if (m != NULL) printf(" , h: %d, w: %d, f: 0x%04X pidx: %lld", m->hold_count, m->wire_count, m->flags, m->pindex); printf("\n"); } } DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va; uint32_t i, start; if (have_addr) { printf("supported only on current pmap\n"); return; } pmap = PCPU_GET(curpmap); printf("curpmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); start = pte1_index((vm_offset_t)PT2MAP); for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, !!(pte1 & PTE1_S)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, pte1, pte2); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO\n"); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } dump_pt2tab(pmap); } #endif Index: stable/11/sys/arm/include/counter.h =================================================================== --- stable/11/sys/arm/include/counter.h (revision 328385) +++ stable/11/sys/arm/include/counter.h (revision 328386) @@ -1,87 +1,87 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #include #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (atomic_load_64((uint64_t *)((char *)p + sizeof(struct pcpu) * cpu))); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { atomic_store_64((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid)), 0); } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) counter_u64_add(c, inc) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { atomic_add_64((uint64_t *)zpcpu_get(c), inc); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: stable/11/sys/arm64/include/counter.h =================================================================== --- stable/11/sys/arm64/include/counter.h (revision 328385) +++ stable/11/sys/arm64/include/counter.h (revision 328386) @@ -1,85 +1,85 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_COUNTER_H_ #define _MACHINE_COUNTER_H_ #include #include #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) counter_u64_add(c, inc) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { atomic_add_64((uint64_t *)zpcpu_get(c), inc); } #endif /* ! _MACHINE_COUNTER_H_ */ Index: stable/11/sys/cddl/dev/dtrace/aarch64/dtrace_subr.c =================================================================== --- stable/11/sys/cddl/dev/dtrace/aarch64/dtrace_subr.c (revision 328385) +++ stable/11/sys/cddl/dev/dtrace/aarch64/dtrace_subr.c (revision 328386) @@ -1,310 +1,310 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr, *prev; hdlr = dtrace_invop_hdlr; prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { (*func)(0, (uintptr_t)VM_MIN_KERNEL_ADDRESS); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); - smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See arm64/arm64/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in it's per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: * */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { case EXCP_DATA_ABORT: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_elr += 4; return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { int data, invop, reg, update_sp; register_t arg1, arg2; register_t *sp; int offs; int tmp; int i; invop = dtrace_invop(frame->tf_elr, frame, frame->tf_elr); tmp = (invop & LDP_STP_MASK); if (tmp == STP_64 || tmp == LDP_64) { sp = (register_t *)frame->tf_sp; data = invop; arg1 = (data >> ARG1_SHIFT) & ARG1_MASK; arg2 = (data >> ARG2_SHIFT) & ARG2_MASK; offs = (data >> OFFSET_SHIFT) & OFFSET_MASK; switch (tmp) { case STP_64: if (offs >> (OFFSET_SIZE - 1)) sp -= (~offs & OFFSET_MASK) + 1; else sp += (offs); *(sp + 0) = frame->tf_x[arg1]; *(sp + 1) = frame->tf_x[arg2]; break; case LDP_64: frame->tf_x[arg1] = *(sp + 0); frame->tf_x[arg2] = *(sp + 1); if (offs >> (OFFSET_SIZE - 1)) sp -= (~offs & OFFSET_MASK) + 1; else sp += (offs); break; default: break; } /* Update the stack pointer and program counter to continue */ frame->tf_sp = (register_t)sp; frame->tf_elr += INSN_SIZE; return (0); } if ((invop & B_MASK) == B_INSTR) { data = (invop & B_DATA_MASK); /* The data is the number of 4-byte words to change the pc */ data *= 4; frame->tf_elr += data; return (0); } if (invop == RET_INSTR) { frame->tf_elr = frame->tf_lr; return (0); } return (-1); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: stable/11/sys/cddl/dev/dtrace/amd64/dtrace_subr.c =================================================================== --- stable/11/sys/cddl/dev/dtrace/amd64/dtrace_subr.c (revision 328385) +++ stable/11/sys/cddl/dev/dtrace/amd64/dtrace_subr.c (revision 328386) @@ -1,445 +1,445 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { (*func)(0, (uintptr_t) addr_PTmap); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); - smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } #ifdef notyet void dtrace_safe_synchronous_signal(void) { kthread_t *t = curthread; struct regs *rp = lwptoregs(ttolwp(t)); size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; ASSERT(t->t_dtrace_on); /* * If we're not in the range of scratch addresses, we're not actually * tracing user instructions so turn off the flags. If the instruction * we copied out caused a synchonous trap, reset the pc back to its * original value and turn off the flags. */ if (rp->r_pc < t->t_dtrace_scrpc || rp->r_pc > t->t_dtrace_astpc + isz) { t->t_dtrace_ft = 0; } else if (rp->r_pc == t->t_dtrace_scrpc || rp->r_pc == t->t_dtrace_astpc) { rp->r_pc = t->t_dtrace_pc; t->t_dtrace_ft = 0; } } int dtrace_safe_defer_signal(void) { kthread_t *t = curthread; struct regs *rp = lwptoregs(ttolwp(t)); size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; ASSERT(t->t_dtrace_on); /* * If we're not in the range of scratch addresses, we're not actually * tracing user instructions so turn off the flags. */ if (rp->r_pc < t->t_dtrace_scrpc || rp->r_pc > t->t_dtrace_astpc + isz) { t->t_dtrace_ft = 0; return (0); } /* * If we have executed the original instruction, but we have performed * neither the jmp back to t->t_dtrace_npc nor the clean up of any * registers used to emulate %rip-relative instructions in 64-bit mode, * we'll save ourselves some effort by doing that here and taking the * signal right away. We detect this condition by seeing if the program * counter is the range [scrpc + isz, astpc). */ if (rp->r_pc >= t->t_dtrace_scrpc + isz && rp->r_pc < t->t_dtrace_astpc) { #ifdef __amd64 /* * If there is a scratch register and we're on the * instruction immediately after the modified instruction, * restore the value of that scratch register. */ if (t->t_dtrace_reg != 0 && rp->r_pc == t->t_dtrace_scrpc + isz) { switch (t->t_dtrace_reg) { case REG_RAX: rp->r_rax = t->t_dtrace_regv; break; case REG_RCX: rp->r_rcx = t->t_dtrace_regv; break; case REG_R8: rp->r_r8 = t->t_dtrace_regv; break; case REG_R9: rp->r_r9 = t->t_dtrace_regv; break; } } #endif rp->r_pc = t->t_dtrace_npc; t->t_dtrace_ft = 0; return (0); } /* * Otherwise, make sure we'll return to the kernel after executing * the copied out instruction and defer the signal. */ if (!t->t_dtrace_step) { ASSERT(rp->r_pc < t->t_dtrace_astpc); rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc; t->t_dtrace_step = 1; } t->t_dtrace_ast = 1; return (1); } #endif static int64_t tgt_cpu_tsc; static int64_t hst_cpu_tsc; static int64_t tsc_skew[MAXCPU]; static uint64_t nsec_scale; /* See below for the explanation of this macro. */ #define SCALE_SHIFT 28 static void dtrace_gethrtime_init_cpu(void *arg) { uintptr_t cpu = (uintptr_t) arg; if (cpu == curcpu) tgt_cpu_tsc = rdtsc(); else hst_cpu_tsc = rdtsc(); } #ifdef EARLY_AP_STARTUP static void dtrace_gethrtime_init(void *arg) { struct pcpu *pc; uint64_t tsc_f; cpuset_t map; int i; #else /* * Get the frequency and scale factor as early as possible so that they can be * used for boot-time tracing. */ static void dtrace_gethrtime_init_early(void *arg) { uint64_t tsc_f; #endif /* * Get TSC frequency known at this moment. * This should be constant if TSC is invariant. * Otherwise tick->time conversion will be inaccurate, but * will preserve monotonic property of TSC. */ tsc_f = atomic_load_acq_64(&tsc_freq); /* * The following line checks that nsec_scale calculated below * doesn't overflow 32-bit unsigned integer, so that it can multiply * another 32-bit integer without overflowing 64-bit. * Thus minimum supported TSC frequency is 62.5MHz. */ KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low")); /* * We scale up NANOSEC/tsc_f ratio to preserve as much precision * as possible. * 2^28 factor was chosen quite arbitrarily from practical * considerations: * - it supports TSC frequencies as low as 62.5MHz (see above); * - it provides quite good precision (e < 0.01%) up to THz * (terahertz) values; */ nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f; #ifndef EARLY_AP_STARTUP } SYSINIT(dtrace_gethrtime_init_early, SI_SUB_CPU, SI_ORDER_ANY, dtrace_gethrtime_init_early, NULL); static void dtrace_gethrtime_init(void *arg) { struct pcpu *pc; cpuset_t map; int i; #endif /* The current CPU is the reference one. */ sched_pin(); tsc_skew[curcpu] = 0; CPU_FOREACH(i) { if (i == curcpu) continue; pc = pcpu_find(i); CPU_SETOF(PCPU_GET(cpuid), &map); CPU_SET(pc->pc_cpuid, &map); smp_rendezvous_cpus(map, NULL, dtrace_gethrtime_init_cpu, - smp_no_rendevous_barrier, (void *)(uintptr_t) i); + smp_no_rendezvous_barrier, (void *)(uintptr_t) i); tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; } sched_unpin(); } #ifdef EARLY_AP_STARTUP SYSINIT(dtrace_gethrtime_init, SI_SUB_DTRACE, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); #else SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); #endif /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime(void) { uint64_t tsc; uint32_t lo, hi; register_t rflags; /* * We split TSC value into lower and higher 32-bit halves and separately * scale them with nsec_scale, then we scale them down by 2^28 * (see nsec_scale calculations) taking into account 32-bit shift of * the higher half and finally add. */ rflags = intr_disable(); tsc = rdtsc() - tsc_skew[curcpu]; intr_restore(rflags); lo = tsc; hi = tsc >> 32; return (((lo * nsec_scale) >> SCALE_SHIFT) + ((hi * nsec_scale) << (32 - SCALE_SHIFT))); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See amd64/amd64/trap.c. */ int dtrace_trap(struct trapframe *frame, u_int type) { uint16_t nofault; /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: */ sched_pin(); nofault = cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT; sched_unpin(); if (nofault) { KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { /* General protection fault. */ case T_PROTFLT: /* Flag an illegal operation. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); return (1); /* Page fault. */ case T_PAGEFLT: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = frame->tf_addr; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_rip += dtrace_instr_size((u_char *) frame->tf_rip); return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } Index: stable/11/sys/cddl/dev/dtrace/arm/dtrace_subr.c =================================================================== --- stable/11/sys/cddl/dev/dtrace/arm/dtrace_subr.c (revision 328385) +++ stable/11/sys/cddl/dev/dtrace/arm/dtrace_subr.c (revision 328386) @@ -1,345 +1,345 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DELAYBRANCH(x) ((int)(x) < 0) #define BIT_PC 15 #define BIT_LR 14 #define BIT_SP 13 extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { printf("IMPLEMENT ME: dtrace_toxic_ranges\n"); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); - smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See amd64/amd64/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in it's per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: * */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { /* Page fault. */ case FAULT_ALIGN: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_pc += sizeof(int); return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { register_t *r0, *sp; int data, invop, reg, update_sp; invop = dtrace_invop(frame->tf_pc, frame, frame->tf_pc); switch (invop & DTRACE_INVOP_MASK) { case DTRACE_INVOP_PUSHM: sp = (register_t *)frame->tf_svc_sp; r0 = &frame->tf_r0; data = DTRACE_INVOP_DATA(invop); /* * Store the pc, lr, and sp. These have their own * entries in the struct. */ if (data & (1 << BIT_PC)) { sp--; *sp = frame->tf_pc; } if (data & (1 << BIT_LR)) { sp--; *sp = frame->tf_svc_lr; } if (data & (1 << BIT_SP)) { sp--; *sp = frame->tf_svc_sp; } /* Store the general registers */ for (reg = 12; reg >= 0; reg--) { if (data & (1 << reg)) { sp--; *sp = r0[reg]; } } /* Update the stack pointer and program counter to continue */ frame->tf_svc_sp = (register_t)sp; frame->tf_pc += 4; break; case DTRACE_INVOP_POPM: sp = (register_t *)frame->tf_svc_sp; r0 = &frame->tf_r0; data = DTRACE_INVOP_DATA(invop); /* Read the general registers */ for (reg = 0; reg <= 12; reg++) { if (data & (1 << reg)) { r0[reg] = *sp; sp++; } } /* * Set the stack pointer. If we don't update it here we will * need to update it at the end as the instruction would do */ update_sp = 1; if (data & (1 << BIT_SP)) { frame->tf_svc_sp = *sp; *sp++; update_sp = 0; } /* Update the link register, we need to use the correct copy */ if (data & (1 << BIT_LR)) { frame->tf_svc_lr = *sp; *sp++; } /* * And the program counter. If it's not in the list skip over * it when we return so to not hit this again. */ if (data & (1 << BIT_PC)) { frame->tf_pc = *sp; *sp++; } else frame->tf_pc += 4; /* Update the stack pointer if we haven't already done so */ if (update_sp) frame->tf_svc_sp = (register_t)sp; break; case DTRACE_INVOP_B: data = DTRACE_INVOP_DATA(invop) & 0x00ffffff; /* Sign extend the data */ if ((data & (1 << 23)) != 0) data |= 0xff000000; /* The data is the number of 4-byte words to change the pc */ data *= 4; data += 8; frame->tf_pc += data; break; default: return (-1); break; } return (0); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: stable/11/sys/cddl/dev/dtrace/i386/dtrace_subr.c =================================================================== --- stable/11/sys/cddl/dev/dtrace/i386/dtrace_subr.c (revision 328385) +++ stable/11/sys/cddl/dev/dtrace/i386/dtrace_subr.c (revision 328386) @@ -1,447 +1,447 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern uintptr_t kernelbase; extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { (*func)(0, kernelbase); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); - smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } #ifdef notyet void dtrace_safe_synchronous_signal(void) { kthread_t *t = curthread; struct regs *rp = lwptoregs(ttolwp(t)); size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; ASSERT(t->t_dtrace_on); /* * If we're not in the range of scratch addresses, we're not actually * tracing user instructions so turn off the flags. If the instruction * we copied out caused a synchonous trap, reset the pc back to its * original value and turn off the flags. */ if (rp->r_pc < t->t_dtrace_scrpc || rp->r_pc > t->t_dtrace_astpc + isz) { t->t_dtrace_ft = 0; } else if (rp->r_pc == t->t_dtrace_scrpc || rp->r_pc == t->t_dtrace_astpc) { rp->r_pc = t->t_dtrace_pc; t->t_dtrace_ft = 0; } } int dtrace_safe_defer_signal(void) { kthread_t *t = curthread; struct regs *rp = lwptoregs(ttolwp(t)); size_t isz = t->t_dtrace_npc - t->t_dtrace_pc; ASSERT(t->t_dtrace_on); /* * If we're not in the range of scratch addresses, we're not actually * tracing user instructions so turn off the flags. */ if (rp->r_pc < t->t_dtrace_scrpc || rp->r_pc > t->t_dtrace_astpc + isz) { t->t_dtrace_ft = 0; return (0); } /* * If we have executed the original instruction, but we have performed * neither the jmp back to t->t_dtrace_npc nor the clean up of any * registers used to emulate %rip-relative instructions in 64-bit mode, * we'll save ourselves some effort by doing that here and taking the * signal right away. We detect this condition by seeing if the program * counter is the range [scrpc + isz, astpc). */ if (rp->r_pc >= t->t_dtrace_scrpc + isz && rp->r_pc < t->t_dtrace_astpc) { #ifdef __amd64 /* * If there is a scratch register and we're on the * instruction immediately after the modified instruction, * restore the value of that scratch register. */ if (t->t_dtrace_reg != 0 && rp->r_pc == t->t_dtrace_scrpc + isz) { switch (t->t_dtrace_reg) { case REG_RAX: rp->r_rax = t->t_dtrace_regv; break; case REG_RCX: rp->r_rcx = t->t_dtrace_regv; break; case REG_R8: rp->r_r8 = t->t_dtrace_regv; break; case REG_R9: rp->r_r9 = t->t_dtrace_regv; break; } } #endif rp->r_pc = t->t_dtrace_npc; t->t_dtrace_ft = 0; return (0); } /* * Otherwise, make sure we'll return to the kernel after executing * the copied out instruction and defer the signal. */ if (!t->t_dtrace_step) { ASSERT(rp->r_pc < t->t_dtrace_astpc); rp->r_pc += t->t_dtrace_astpc - t->t_dtrace_scrpc; t->t_dtrace_step = 1; } t->t_dtrace_ast = 1; return (1); } #endif static int64_t tgt_cpu_tsc; static int64_t hst_cpu_tsc; static int64_t tsc_skew[MAXCPU]; static uint64_t nsec_scale; /* See below for the explanation of this macro. */ #define SCALE_SHIFT 28 static void dtrace_gethrtime_init_cpu(void *arg) { uintptr_t cpu = (uintptr_t) arg; if (cpu == curcpu) tgt_cpu_tsc = rdtsc(); else hst_cpu_tsc = rdtsc(); } #ifdef EARLY_AP_STARTUP static void dtrace_gethrtime_init(void *arg) { struct pcpu *pc; uint64_t tsc_f; cpuset_t map; int i; #else /* * Get the frequency and scale factor as early as possible so that they can be * used for boot-time tracing. */ static void dtrace_gethrtime_init_early(void *arg) { uint64_t tsc_f; #endif /* * Get TSC frequency known at this moment. * This should be constant if TSC is invariant. * Otherwise tick->time conversion will be inaccurate, but * will preserve monotonic property of TSC. */ tsc_f = atomic_load_acq_64(&tsc_freq); /* * The following line checks that nsec_scale calculated below * doesn't overflow 32-bit unsigned integer, so that it can multiply * another 32-bit integer without overflowing 64-bit. * Thus minimum supported TSC frequency is 62.5MHz. */ KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low")); /* * We scale up NANOSEC/tsc_f ratio to preserve as much precision * as possible. * 2^28 factor was chosen quite arbitrarily from practical * considerations: * - it supports TSC frequencies as low as 62.5MHz (see above); * - it provides quite good precision (e < 0.01%) up to THz * (terahertz) values; */ nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f; #ifndef EARLY_AP_STARTUP } SYSINIT(dtrace_gethrtime_init_early, SI_SUB_CPU, SI_ORDER_ANY, dtrace_gethrtime_init_early, NULL); static void dtrace_gethrtime_init(void *arg) { cpuset_t map; struct pcpu *pc; int i; #endif /* The current CPU is the reference one. */ sched_pin(); tsc_skew[curcpu] = 0; CPU_FOREACH(i) { if (i == curcpu) continue; pc = pcpu_find(i); CPU_SETOF(PCPU_GET(cpuid), &map); CPU_SET(pc->pc_cpuid, &map); smp_rendezvous_cpus(map, NULL, dtrace_gethrtime_init_cpu, - smp_no_rendevous_barrier, (void *)(uintptr_t) i); + smp_no_rendezvous_barrier, (void *)(uintptr_t) i); tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; } sched_unpin(); } #ifdef EARLY_AP_STARTUP SYSINIT(dtrace_gethrtime_init, SI_SUB_DTRACE, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); #else SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); #endif /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime(void) { uint64_t tsc; uint32_t lo, hi; register_t eflags; /* * We split TSC value into lower and higher 32-bit halves and separately * scale them with nsec_scale, then we scale them down by 2^28 * (see nsec_scale calculations) taking into account 32-bit shift of * the higher half and finally add. */ eflags = intr_disable(); tsc = rdtsc() - tsc_skew[curcpu]; intr_restore(eflags); lo = tsc; hi = tsc >> 32; return (((lo * nsec_scale) >> SCALE_SHIFT) + ((hi * nsec_scale) << (32 - SCALE_SHIFT))); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See i386/i386/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { uint16_t nofault; /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: */ sched_pin(); nofault = cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT; sched_unpin(); if (nofault) { KASSERT((read_eflags() & PSL_I) == 0, ("interrupts enabled")); /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { /* General protection fault. */ case T_PROTFLT: /* Flag an illegal operation. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip); return (1); /* Page fault. */ case T_PAGEFLT: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = rcr2(); /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_eip += dtrace_instr_size((u_char *) frame->tf_eip); return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } Index: stable/11/sys/cddl/dev/dtrace/mips/dtrace_subr.c =================================================================== --- stable/11/sys/cddl/dev/dtrace/mips/dtrace_subr.c (revision 328385) +++ stable/11/sys/cddl/dev/dtrace/mips/dtrace_subr.c (revision 328386) @@ -1,284 +1,284 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define DELAYBRANCH(x) ((int)(x) < 0) extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern dtrace_id_t dtrace_probeid_error; int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *stack, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, stack, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr, *prev; hdlr = dtrace_invop_hdlr; prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { /* * No toxic regions? */ } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); - smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec curtime; getnanotime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } /* Function to handle DTrace traps during probes. See amd64/amd64/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { /* Page fault. */ case T_TLB_ST_MISS: case T_ADDR_ERR_ST: case T_TLB_LD_MISS: case T_ADDR_ERR_LD: case T_BUS_ERR_IFETCH: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = frame->badvaddr; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ if (DELAYBRANCH(frame->cause)) /* Check BD bit */ { /* XXX: check MipsEmulateBranch on MIPS64 frame->pc = MipsEmulateBranch(frame, frame->pc, 0, 0); */ panic("%s: delay slot at %jx, badvaddr = %jx\n", __func__, (intmax_t)frame->pc, (intmax_t)frame->badvaddr); } else frame->pc += sizeof(int); return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { register_t *sp; int16_t offs; int invop; invop = dtrace_invop(frame->pc, frame, frame->pc); offs = (invop & LDSD_DATA_MASK); sp = (register_t *)((uint8_t *)frame->sp + offs); switch (invop & LDSD_RA_SP_MASK) { case LD_RA_SP: frame->ra = *sp; frame->pc += INSN_SIZE; break; case SD_RA_SP: *(sp) = frame->ra; frame->pc += INSN_SIZE; break; default: printf("%s: 0x%x undefined\n", __func__, invop); return (-1); }; return (0); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: stable/11/sys/cddl/dev/dtrace/powerpc/dtrace_subr.c =================================================================== --- stable/11/sys/cddl/dev/dtrace/powerpc/dtrace_subr.c (revision 328385) +++ stable/11/sys/cddl/dev/dtrace/powerpc/dtrace_subr.c (revision 328386) @@ -1,362 +1,362 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define DELAYBRANCH(x) ((int)(x) < 0) extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t arg0) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, arg0)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { /* * No toxic regions? */ } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); - smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } static int64_t tgt_cpu_tsc; static int64_t hst_cpu_tsc; static int64_t timebase_skew[MAXCPU]; static uint64_t nsec_scale; /* See below for the explanation of this macro. */ /* This is taken from the amd64 dtrace_subr, to provide a synchronized timer * between multiple processors in dtrace. Since PowerPC Timebases can be much * lower than x86, the scale shift is 26 instead of 28, allowing for a 15.63MHz * timebase. */ #define SCALE_SHIFT 26 static void dtrace_gethrtime_init_cpu(void *arg) { uintptr_t cpu = (uintptr_t) arg; if (cpu == curcpu) tgt_cpu_tsc = mftb(); else hst_cpu_tsc = mftb(); } static void dtrace_gethrtime_init(void *arg) { struct pcpu *pc; uint64_t tb_f; cpuset_t map; int i; tb_f = cpu_tickrate(); /* * The following line checks that nsec_scale calculated below * doesn't overflow 32-bit unsigned integer, so that it can multiply * another 32-bit integer without overflowing 64-bit. * Thus minimum supported Timebase frequency is 15.63MHz. */ KASSERT(tb_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("Timebase frequency is too low")); /* * We scale up NANOSEC/tb_f ratio to preserve as much precision * as possible. * 2^26 factor was chosen quite arbitrarily from practical * considerations: * - it supports TSC frequencies as low as 15.63MHz (see above); */ nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tb_f; /* The current CPU is the reference one. */ sched_pin(); timebase_skew[curcpu] = 0; CPU_FOREACH(i) { if (i == curcpu) continue; pc = pcpu_find(i); CPU_SETOF(PCPU_GET(cpuid), &map); CPU_SET(pc->pc_cpuid, &map); smp_rendezvous_cpus(map, NULL, dtrace_gethrtime_init_cpu, - smp_no_rendevous_barrier, (void *)(uintptr_t) i); + smp_no_rendezvous_barrier, (void *)(uintptr_t) i); timebase_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; } sched_unpin(); } #ifdef EARLY_AP_STARTUP SYSINIT(dtrace_gethrtime_init, SI_SUB_DTRACE, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); #else SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL); #endif /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { uint64_t timebase; uint32_t lo; uint32_t hi; /* * We split timebase value into lower and higher 32-bit halves and separately * scale them with nsec_scale, then we scale them down by 2^28 * (see nsec_scale calculations) taking into account 32-bit shift of * the higher half and finally add. */ timebase = mftb() - timebase_skew[curcpu]; lo = timebase; hi = timebase >> 32; return (((lo * nsec_scale) >> SCALE_SHIFT) + ((hi * nsec_scale) << (32 - SCALE_SHIFT))); } uint64_t dtrace_gethrestime(void) { struct timespec curtime; dtrace_getnanotime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } /* Function to handle DTrace traps during probes. See powerpc/powerpc/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { /* Page fault. */ case EXC_DSI: case EXC_DSE: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = frame->dar; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->srr0 += sizeof(int); return (1); case EXC_ISI: case EXC_ISE: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = frame->srr0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->srr0 += sizeof(int); return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { switch (dtrace_invop(frame->srr0, frame, frame->fixreg[3])) { case DTRACE_INVOP_JUMP: break; case DTRACE_INVOP_BCTR: frame->srr0 = frame->ctr; break; case DTRACE_INVOP_BLR: frame->srr0 = frame->lr; break; case DTRACE_INVOP_MFLR_R0: frame->fixreg[0] = frame->lr; frame->srr0 = frame->srr0 + 4; break; default: return (-1); } return (0); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: stable/11/sys/cddl/dev/dtrace/riscv/dtrace_subr.c =================================================================== --- stable/11/sys/cddl/dev/dtrace/riscv/dtrace_subr.c (revision 328385) +++ stable/11/sys/cddl/dev/dtrace/riscv/dtrace_subr.c (revision 328386) @@ -1,282 +1,282 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * Portions Copyright 2016 Ruslan Bukin * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr, *prev; hdlr = dtrace_invop_hdlr; prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { (*func)(0, (uintptr_t)VM_MIN_KERNEL_ADDRESS); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); - smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, - smp_no_rendevous_barrier, arg); + smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See riscv/riscv/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in it's per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: * */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { case EXCP_LOAD_ACCESS_FAULT: case EXCP_STORE_ACCESS_FAULT: case EXCP_INSTR_ACCESS_FAULT: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_sepc += 4; return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { int data, invop, reg, update_sp; register_t arg1, arg2; register_t *sp; uint32_t imm; InstFmt i; int offs; int tmp; invop = dtrace_invop(frame->tf_sepc, frame, frame->tf_sepc); if (invop == RISCV_INSN_RET) { frame->tf_sepc = frame->tf_ra; return (0); } if ((invop & SD_RA_SP_MASK) == SD_RA_SP) { i.word = invop; imm = i.SType.imm0_4 | (i.SType.imm5_11 << 5); sp = (register_t *)((uint8_t *)frame->tf_sp + imm); *sp = frame->tf_ra; frame->tf_sepc += INSN_SIZE; return (0); } return (-1); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: stable/11/sys/dev/acpica/acpi_cpu.c =================================================================== --- stable/11/sys/dev/acpica/acpi_cpu.c (revision 328385) +++ stable/11/sys/dev/acpica/acpi_cpu.c (revision 328386) @@ -1,1526 +1,1526 @@ /*- * Copyright (c) 2003-2005 Nate Lawson (SDG) * Copyright (c) 2001 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__amd64__) || defined(__i386__) #include #include #include #endif #include #include #include #include /* * Support for ACPI Processor devices, including C[1-3] sleep states. */ /* Hooks for the ACPI CA debugging infrastructure */ #define _COMPONENT ACPI_PROCESSOR ACPI_MODULE_NAME("PROCESSOR") struct acpi_cx { struct resource *p_lvlx; /* Register to read to enter state. */ uint32_t type; /* C1-3 (C4 and up treated as C3). */ uint32_t trans_lat; /* Transition latency (usec). */ uint32_t power; /* Power consumed (mW). */ int res_type; /* Resource type for p_lvlx. */ int res_rid; /* Resource ID for p_lvlx. */ bool do_mwait; uint32_t mwait_hint; bool mwait_hw_coord; bool mwait_bm_avoidance; }; #define MAX_CX_STATES 8 struct acpi_cpu_softc { device_t cpu_dev; ACPI_HANDLE cpu_handle; struct pcpu *cpu_pcpu; uint32_t cpu_acpi_id; /* ACPI processor id */ uint32_t cpu_p_blk; /* ACPI P_BLK location */ uint32_t cpu_p_blk_len; /* P_BLK length (must be 6). */ struct acpi_cx cpu_cx_states[MAX_CX_STATES]; int cpu_cx_count; /* Number of valid Cx states. */ int cpu_prev_sleep;/* Last idle sleep duration. */ int cpu_features; /* Child driver supported features. */ /* Runtime state. */ int cpu_non_c2; /* Index of lowest non-C2 state. */ int cpu_non_c3; /* Index of lowest non-C3 state. */ u_int cpu_cx_stats[MAX_CX_STATES];/* Cx usage history. */ /* Values for sysctl. */ struct sysctl_ctx_list cpu_sysctl_ctx; struct sysctl_oid *cpu_sysctl_tree; int cpu_cx_lowest; int cpu_cx_lowest_lim; int cpu_disable_idle; /* Disable entry to idle function */ char cpu_cx_supported[64]; }; struct acpi_cpu_device { struct resource_list ad_rl; }; #define CPU_GET_REG(reg, width) \ (bus_space_read_ ## width(rman_get_bustag((reg)), \ rman_get_bushandle((reg)), 0)) #define CPU_SET_REG(reg, width, val) \ (bus_space_write_ ## width(rman_get_bustag((reg)), \ rman_get_bushandle((reg)), 0, (val))) #define PM_USEC(x) ((x) >> 2) /* ~4 clocks per usec (3.57955 Mhz) */ #define ACPI_NOTIFY_CX_STATES 0x81 /* _CST changed. */ #define CPU_QUIRK_NO_C3 (1<<0) /* C3-type states are not usable. */ #define CPU_QUIRK_NO_BM_CTRL (1<<2) /* No bus mastering control. */ #define PCI_VENDOR_INTEL 0x8086 #define PCI_DEVICE_82371AB_3 0x7113 /* PIIX4 chipset for quirks. */ #define PCI_REVISION_A_STEP 0 #define PCI_REVISION_B_STEP 1 #define PCI_REVISION_4E 2 #define PCI_REVISION_4M 3 #define PIIX4_DEVACTB_REG 0x58 #define PIIX4_BRLD_EN_IRQ0 (1<<0) #define PIIX4_BRLD_EN_IRQ (1<<1) #define PIIX4_BRLD_EN_IRQ8 (1<<5) #define PIIX4_STOP_BREAK_MASK (PIIX4_BRLD_EN_IRQ0 | PIIX4_BRLD_EN_IRQ | PIIX4_BRLD_EN_IRQ8) #define PIIX4_PCNTRL_BST_EN (1<<10) #define CST_FFH_VENDOR_INTEL 1 #define CST_FFH_INTEL_CL_C1IO 1 #define CST_FFH_INTEL_CL_MWAIT 2 #define CST_FFH_MWAIT_HW_COORD 0x0001 #define CST_FFH_MWAIT_BM_AVOID 0x0002 /* Allow users to ignore processor orders in MADT. */ static int cpu_unordered; SYSCTL_INT(_debug_acpi, OID_AUTO, cpu_unordered, CTLFLAG_RDTUN, &cpu_unordered, 0, "Do not use the MADT to match ACPI Processor objects to CPUs."); /* Knob to disable acpi_cpu devices */ bool acpi_cpu_disabled = false; /* Platform hardware resource information. */ static uint32_t cpu_smi_cmd; /* Value to write to SMI_CMD. */ static uint8_t cpu_cst_cnt; /* Indicate we are _CST aware. */ static int cpu_quirks; /* Indicate any hardware bugs. */ /* Values for sysctl. */ static struct sysctl_ctx_list cpu_sysctl_ctx; static struct sysctl_oid *cpu_sysctl_tree; static int cpu_cx_generic; static int cpu_cx_lowest_lim; static device_t *cpu_devices; static int cpu_ndevices; static struct acpi_cpu_softc **cpu_softc; ACPI_SERIAL_DECL(cpu, "ACPI CPU"); static int acpi_cpu_probe(device_t dev); static int acpi_cpu_attach(device_t dev); static int acpi_cpu_suspend(device_t dev); static int acpi_cpu_resume(device_t dev); static int acpi_pcpu_get_id(device_t dev, uint32_t *acpi_id, uint32_t *cpu_id); static struct resource_list *acpi_cpu_get_rlist(device_t dev, device_t child); static device_t acpi_cpu_add_child(device_t dev, u_int order, const char *name, int unit); static int acpi_cpu_read_ivar(device_t dev, device_t child, int index, uintptr_t *result); static int acpi_cpu_shutdown(device_t dev); static void acpi_cpu_cx_probe(struct acpi_cpu_softc *sc); static void acpi_cpu_generic_cx_probe(struct acpi_cpu_softc *sc); static int acpi_cpu_cx_cst(struct acpi_cpu_softc *sc); static void acpi_cpu_startup(void *arg); static void acpi_cpu_startup_cx(struct acpi_cpu_softc *sc); static void acpi_cpu_cx_list(struct acpi_cpu_softc *sc); #if defined(__i386__) || defined(__amd64__) static void acpi_cpu_idle(sbintime_t sbt); #endif static void acpi_cpu_notify(ACPI_HANDLE h, UINT32 notify, void *context); static void acpi_cpu_quirks(void); static void acpi_cpu_quirks_piix4(void); static int acpi_cpu_usage_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_cpu_usage_counters_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_cpu_set_cx_lowest(struct acpi_cpu_softc *sc); static int acpi_cpu_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_cpu_global_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS); #if defined(__i386__) || defined(__amd64__) static int acpi_cpu_method_sysctl(SYSCTL_HANDLER_ARGS); #endif static device_method_t acpi_cpu_methods[] = { /* Device interface */ DEVMETHOD(device_probe, acpi_cpu_probe), DEVMETHOD(device_attach, acpi_cpu_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, acpi_cpu_shutdown), DEVMETHOD(device_suspend, acpi_cpu_suspend), DEVMETHOD(device_resume, acpi_cpu_resume), /* Bus interface */ DEVMETHOD(bus_add_child, acpi_cpu_add_child), DEVMETHOD(bus_read_ivar, acpi_cpu_read_ivar), DEVMETHOD(bus_get_resource_list, acpi_cpu_get_rlist), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), DEVMETHOD(bus_alloc_resource, bus_generic_rl_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_rl_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD_END }; static driver_t acpi_cpu_driver = { "cpu", acpi_cpu_methods, sizeof(struct acpi_cpu_softc), }; static devclass_t acpi_cpu_devclass; DRIVER_MODULE(cpu, acpi, acpi_cpu_driver, acpi_cpu_devclass, 0, 0); MODULE_DEPEND(cpu, acpi, 1, 1, 1); static int acpi_cpu_probe(device_t dev) { int acpi_id, cpu_id; ACPI_BUFFER buf; ACPI_HANDLE handle; ACPI_OBJECT *obj; ACPI_STATUS status; if (acpi_disabled("cpu") || acpi_get_type(dev) != ACPI_TYPE_PROCESSOR || acpi_cpu_disabled) return (ENXIO); handle = acpi_get_handle(dev); if (cpu_softc == NULL) cpu_softc = malloc(sizeof(struct acpi_cpu_softc *) * (mp_maxid + 1), M_TEMP /* XXX */, M_WAITOK | M_ZERO); /* Get our Processor object. */ buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(handle, NULL, NULL, &buf); if (ACPI_FAILURE(status)) { device_printf(dev, "probe failed to get Processor obj - %s\n", AcpiFormatException(status)); return (ENXIO); } obj = (ACPI_OBJECT *)buf.Pointer; if (obj->Type != ACPI_TYPE_PROCESSOR) { device_printf(dev, "Processor object has bad type %d\n", obj->Type); AcpiOsFree(obj); return (ENXIO); } /* * Find the processor associated with our unit. We could use the * ProcId as a key, however, some boxes do not have the same values * in their Processor object as the ProcId values in the MADT. */ acpi_id = obj->Processor.ProcId; AcpiOsFree(obj); if (acpi_pcpu_get_id(dev, &acpi_id, &cpu_id) != 0) return (ENXIO); /* * Check if we already probed this processor. We scan the bus twice * so it's possible we've already seen this one. */ if (cpu_softc[cpu_id] != NULL) return (ENXIO); /* Mark this processor as in-use and save our derived id for attach. */ cpu_softc[cpu_id] = (void *)1; acpi_set_private(dev, (void*)(intptr_t)cpu_id); device_set_desc(dev, "ACPI CPU"); return (0); } static int acpi_cpu_attach(device_t dev) { ACPI_BUFFER buf; ACPI_OBJECT arg, *obj; ACPI_OBJECT_LIST arglist; struct pcpu *pcpu_data; struct acpi_cpu_softc *sc; struct acpi_softc *acpi_sc; ACPI_STATUS status; u_int features; int cpu_id, drv_count, i; driver_t **drivers; uint32_t cap_set[3]; /* UUID needed by _OSC evaluation */ static uint8_t cpu_oscuuid[16] = { 0x16, 0xA6, 0x77, 0x40, 0x0C, 0x29, 0xBE, 0x47, 0x9E, 0xBD, 0xD8, 0x70, 0x58, 0x71, 0x39, 0x53 }; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); sc = device_get_softc(dev); sc->cpu_dev = dev; sc->cpu_handle = acpi_get_handle(dev); cpu_id = (int)(intptr_t)acpi_get_private(dev); cpu_softc[cpu_id] = sc; pcpu_data = pcpu_find(cpu_id); pcpu_data->pc_device = dev; sc->cpu_pcpu = pcpu_data; cpu_smi_cmd = AcpiGbl_FADT.SmiCommand; cpu_cst_cnt = AcpiGbl_FADT.CstControl; buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(sc->cpu_handle, NULL, NULL, &buf); if (ACPI_FAILURE(status)) { device_printf(dev, "attach failed to get Processor obj - %s\n", AcpiFormatException(status)); return (ENXIO); } obj = (ACPI_OBJECT *)buf.Pointer; sc->cpu_p_blk = obj->Processor.PblkAddress; sc->cpu_p_blk_len = obj->Processor.PblkLength; sc->cpu_acpi_id = obj->Processor.ProcId; AcpiOsFree(obj); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: P_BLK at %#x/%d\n", device_get_unit(dev), sc->cpu_p_blk, sc->cpu_p_blk_len)); /* * If this is the first cpu we attach, create and initialize the generic * resources that will be used by all acpi cpu devices. */ if (device_get_unit(dev) == 0) { /* Assume we won't be using generic Cx mode by default */ cpu_cx_generic = FALSE; /* Install hw.acpi.cpu sysctl tree */ acpi_sc = acpi_device_get_parent_softc(dev); sysctl_ctx_init(&cpu_sysctl_ctx); cpu_sysctl_tree = SYSCTL_ADD_NODE(&cpu_sysctl_ctx, SYSCTL_CHILDREN(acpi_sc->acpi_sysctl_tree), OID_AUTO, "cpu", CTLFLAG_RD, 0, "node for CPU children"); } /* * Before calling any CPU methods, collect child driver feature hints * and notify ACPI of them. We support unified SMP power control * so advertise this ourselves. Note this is not the same as independent * SMP control where each CPU can have different settings. */ sc->cpu_features = ACPI_CAP_SMP_SAME | ACPI_CAP_SMP_SAME_C3 | ACPI_CAP_C1_IO_HALT; #if defined(__i386__) || defined(__amd64__) /* * Ask for MWAIT modes if not disabled and interrupts work * reasonable with MWAIT. */ if (!acpi_disabled("mwait") && cpu_mwait_usable()) sc->cpu_features |= ACPI_CAP_SMP_C1_NATIVE | ACPI_CAP_SMP_C3_NATIVE; #endif if (devclass_get_drivers(acpi_cpu_devclass, &drivers, &drv_count) == 0) { for (i = 0; i < drv_count; i++) { if (ACPI_GET_FEATURES(drivers[i], &features) == 0) sc->cpu_features |= features; } free(drivers, M_TEMP); } /* * CPU capabilities are specified in * Intel Processor Vendor-Specific ACPI Interface Specification. */ if (sc->cpu_features) { cap_set[1] = sc->cpu_features; status = acpi_EvaluateOSC(sc->cpu_handle, cpu_oscuuid, 1, 2, cap_set, cap_set, false); if (ACPI_SUCCESS(status)) { if (cap_set[0] != 0) device_printf(dev, "_OSC returned status %#x\n", cap_set[0]); } else { arglist.Pointer = &arg; arglist.Count = 1; arg.Type = ACPI_TYPE_BUFFER; arg.Buffer.Length = sizeof(cap_set); arg.Buffer.Pointer = (uint8_t *)cap_set; cap_set[0] = 1; /* revision */ cap_set[1] = 1; /* number of capabilities integers */ cap_set[2] = sc->cpu_features; AcpiEvaluateObject(sc->cpu_handle, "_PDC", &arglist, NULL); } } /* Probe for Cx state support. */ acpi_cpu_cx_probe(sc); return (0); } static void acpi_cpu_postattach(void *unused __unused) { device_t *devices; int err; int i, n; int attached; err = devclass_get_devices(acpi_cpu_devclass, &devices, &n); if (err != 0) { printf("devclass_get_devices(acpi_cpu_devclass) failed\n"); return; } attached = 0; for (i = 0; i < n; i++) if (device_is_attached(devices[i]) && device_get_driver(devices[i]) == &acpi_cpu_driver) attached = 1; for (i = 0; i < n; i++) bus_generic_probe(devices[i]); for (i = 0; i < n; i++) bus_generic_attach(devices[i]); free(devices, M_TEMP); if (attached) { #ifdef EARLY_AP_STARTUP acpi_cpu_startup(NULL); #else /* Queue post cpu-probing task handler */ AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cpu_startup, NULL); #endif } } SYSINIT(acpi_cpu, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE, acpi_cpu_postattach, NULL); static void disable_idle(struct acpi_cpu_softc *sc) { cpuset_t cpuset; CPU_SETOF(sc->cpu_pcpu->pc_cpuid, &cpuset); sc->cpu_disable_idle = TRUE; /* * Ensure that the CPU is not in idle state or in acpi_cpu_idle(). * Note that this code depends on the fact that the rendezvous IPI * can not penetrate context where interrupts are disabled and acpi_cpu_idle * is called and executed in such a context with interrupts being re-enabled * right before return. */ - smp_rendezvous_cpus(cpuset, smp_no_rendevous_barrier, NULL, - smp_no_rendevous_barrier, NULL); + smp_rendezvous_cpus(cpuset, smp_no_rendezvous_barrier, NULL, + smp_no_rendezvous_barrier, NULL); } static void enable_idle(struct acpi_cpu_softc *sc) { sc->cpu_disable_idle = FALSE; } #if defined(__i386__) || defined(__amd64__) static int is_idle_disabled(struct acpi_cpu_softc *sc) { return (sc->cpu_disable_idle); } #endif /* * Disable any entry to the idle function during suspend and re-enable it * during resume. */ static int acpi_cpu_suspend(device_t dev) { int error; error = bus_generic_suspend(dev); if (error) return (error); disable_idle(device_get_softc(dev)); return (0); } static int acpi_cpu_resume(device_t dev) { enable_idle(device_get_softc(dev)); return (bus_generic_resume(dev)); } /* * Find the processor associated with a given ACPI ID. By default, * use the MADT to map ACPI IDs to APIC IDs and use that to locate a * processor. Some systems have inconsistent ASL and MADT however. * For these systems the cpu_unordered tunable can be set in which * case we assume that Processor objects are listed in the same order * in both the MADT and ASL. */ static int acpi_pcpu_get_id(device_t dev, uint32_t *acpi_id, uint32_t *cpu_id) { struct pcpu *pc; uint32_t i, idx; KASSERT(acpi_id != NULL, ("Null acpi_id")); KASSERT(cpu_id != NULL, ("Null cpu_id")); idx = device_get_unit(dev); /* * If pc_acpi_id for CPU 0 is not initialized (e.g. a non-APIC * UP box) use the ACPI ID from the first processor we find. */ if (idx == 0 && mp_ncpus == 1) { pc = pcpu_find(0); if (pc->pc_acpi_id == 0xffffffff) pc->pc_acpi_id = *acpi_id; *cpu_id = 0; return (0); } CPU_FOREACH(i) { pc = pcpu_find(i); KASSERT(pc != NULL, ("no pcpu data for %d", i)); if (cpu_unordered) { if (idx-- == 0) { /* * If pc_acpi_id doesn't match the ACPI ID from the * ASL, prefer the MADT-derived value. */ if (pc->pc_acpi_id != *acpi_id) *acpi_id = pc->pc_acpi_id; *cpu_id = pc->pc_cpuid; return (0); } } else { if (pc->pc_acpi_id == *acpi_id) { if (bootverbose) device_printf(dev, "Processor %s (ACPI ID %u) -> APIC ID %d\n", acpi_name(acpi_get_handle(dev)), *acpi_id, pc->pc_cpuid); *cpu_id = pc->pc_cpuid; return (0); } } } if (bootverbose) printf("ACPI: Processor %s (ACPI ID %u) ignored\n", acpi_name(acpi_get_handle(dev)), *acpi_id); return (ESRCH); } static struct resource_list * acpi_cpu_get_rlist(device_t dev, device_t child) { struct acpi_cpu_device *ad; ad = device_get_ivars(child); if (ad == NULL) return (NULL); return (&ad->ad_rl); } static device_t acpi_cpu_add_child(device_t dev, u_int order, const char *name, int unit) { struct acpi_cpu_device *ad; device_t child; if ((ad = malloc(sizeof(*ad), M_TEMP, M_NOWAIT | M_ZERO)) == NULL) return (NULL); resource_list_init(&ad->ad_rl); child = device_add_child_ordered(dev, order, name, unit); if (child != NULL) device_set_ivars(child, ad); else free(ad, M_TEMP); return (child); } static int acpi_cpu_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct acpi_cpu_softc *sc; sc = device_get_softc(dev); switch (index) { case ACPI_IVAR_HANDLE: *result = (uintptr_t)sc->cpu_handle; break; case CPU_IVAR_PCPU: *result = (uintptr_t)sc->cpu_pcpu; break; #if defined(__amd64__) || defined(__i386__) case CPU_IVAR_NOMINAL_MHZ: if (tsc_is_invariant) { *result = (uintptr_t)(atomic_load_acq_64(&tsc_freq) / 1000000); break; } /* FALLTHROUGH */ #endif default: return (ENOENT); } return (0); } static int acpi_cpu_shutdown(device_t dev) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Allow children to shutdown first. */ bus_generic_shutdown(dev); /* * Disable any entry to the idle function. */ disable_idle(device_get_softc(dev)); /* * CPU devices are not truly detached and remain referenced, * so their resources are not freed. */ return_VALUE (0); } static void acpi_cpu_cx_probe(struct acpi_cpu_softc *sc) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Use initial sleep value of 1 sec. to start with lowest idle state. */ sc->cpu_prev_sleep = 1000000; sc->cpu_cx_lowest = 0; sc->cpu_cx_lowest_lim = 0; /* * Check for the ACPI 2.0 _CST sleep states object. If we can't find * any, we'll revert to generic FADT/P_BLK Cx control method which will * be handled by acpi_cpu_startup. We need to defer to after having * probed all the cpus in the system before probing for generic Cx * states as we may already have found cpus with valid _CST packages */ if (!cpu_cx_generic && acpi_cpu_cx_cst(sc) != 0) { /* * We were unable to find a _CST package for this cpu or there * was an error parsing it. Switch back to generic mode. */ cpu_cx_generic = TRUE; if (bootverbose) device_printf(sc->cpu_dev, "switching to generic Cx mode\n"); } /* * TODO: _CSD Package should be checked here. */ } static void acpi_cpu_generic_cx_probe(struct acpi_cpu_softc *sc) { ACPI_GENERIC_ADDRESS gas; struct acpi_cx *cx_ptr; sc->cpu_cx_count = 0; cx_ptr = sc->cpu_cx_states; /* Use initial sleep value of 1 sec. to start with lowest idle state. */ sc->cpu_prev_sleep = 1000000; /* C1 has been required since just after ACPI 1.0 */ cx_ptr->type = ACPI_STATE_C1; cx_ptr->trans_lat = 0; cx_ptr++; sc->cpu_non_c2 = sc->cpu_cx_count; sc->cpu_non_c3 = sc->cpu_cx_count; sc->cpu_cx_count++; /* * The spec says P_BLK must be 6 bytes long. However, some systems * use it to indicate a fractional set of features present so we * take 5 as C2. Some may also have a value of 7 to indicate * another C3 but most use _CST for this (as required) and having * "only" C1-C3 is not a hardship. */ if (sc->cpu_p_blk_len < 5) return; /* Validate and allocate resources for C2 (P_LVL2). */ gas.SpaceId = ACPI_ADR_SPACE_SYSTEM_IO; gas.BitWidth = 8; if (AcpiGbl_FADT.C2Latency <= 100) { gas.Address = sc->cpu_p_blk + 4; cx_ptr->res_rid = 0; acpi_bus_alloc_gas(sc->cpu_dev, &cx_ptr->res_type, &cx_ptr->res_rid, &gas, &cx_ptr->p_lvlx, RF_SHAREABLE); if (cx_ptr->p_lvlx != NULL) { cx_ptr->type = ACPI_STATE_C2; cx_ptr->trans_lat = AcpiGbl_FADT.C2Latency; cx_ptr++; sc->cpu_non_c3 = sc->cpu_cx_count; sc->cpu_cx_count++; } } if (sc->cpu_p_blk_len < 6) return; /* Validate and allocate resources for C3 (P_LVL3). */ if (AcpiGbl_FADT.C3Latency <= 1000 && !(cpu_quirks & CPU_QUIRK_NO_C3)) { gas.Address = sc->cpu_p_blk + 5; cx_ptr->res_rid = 1; acpi_bus_alloc_gas(sc->cpu_dev, &cx_ptr->res_type, &cx_ptr->res_rid, &gas, &cx_ptr->p_lvlx, RF_SHAREABLE); if (cx_ptr->p_lvlx != NULL) { cx_ptr->type = ACPI_STATE_C3; cx_ptr->trans_lat = AcpiGbl_FADT.C3Latency; cx_ptr++; sc->cpu_cx_count++; } } } #if defined(__i386__) || defined(__amd64__) static void acpi_cpu_cx_cst_mwait(struct acpi_cx *cx_ptr, uint64_t address, int accsize) { cx_ptr->do_mwait = true; cx_ptr->mwait_hint = address & 0xffffffff; cx_ptr->mwait_hw_coord = (accsize & CST_FFH_MWAIT_HW_COORD) != 0; cx_ptr->mwait_bm_avoidance = (accsize & CST_FFH_MWAIT_BM_AVOID) != 0; } #endif static void acpi_cpu_cx_cst_free_plvlx(device_t cpu_dev, struct acpi_cx *cx_ptr) { if (cx_ptr->p_lvlx == NULL) return; bus_release_resource(cpu_dev, cx_ptr->res_type, cx_ptr->res_rid, cx_ptr->p_lvlx); cx_ptr->p_lvlx = NULL; } /* * Parse a _CST package and set up its Cx states. Since the _CST object * can change dynamically, our notify handler may call this function * to clean up and probe the new _CST package. */ static int acpi_cpu_cx_cst(struct acpi_cpu_softc *sc) { struct acpi_cx *cx_ptr; ACPI_STATUS status; ACPI_BUFFER buf; ACPI_OBJECT *top; ACPI_OBJECT *pkg; uint32_t count; int i; #if defined(__i386__) || defined(__amd64__) uint64_t address; int vendor, class, accsize; #endif ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(sc->cpu_handle, "_CST", NULL, &buf); if (ACPI_FAILURE(status)) return (ENXIO); /* _CST is a package with a count and at least one Cx package. */ top = (ACPI_OBJECT *)buf.Pointer; if (!ACPI_PKG_VALID(top, 2) || acpi_PkgInt32(top, 0, &count) != 0) { device_printf(sc->cpu_dev, "invalid _CST package\n"); AcpiOsFree(buf.Pointer); return (ENXIO); } if (count != top->Package.Count - 1) { device_printf(sc->cpu_dev, "invalid _CST state count (%d != %d)\n", count, top->Package.Count - 1); count = top->Package.Count - 1; } if (count > MAX_CX_STATES) { device_printf(sc->cpu_dev, "_CST has too many states (%d)\n", count); count = MAX_CX_STATES; } sc->cpu_non_c2 = 0; sc->cpu_non_c3 = 0; sc->cpu_cx_count = 0; cx_ptr = sc->cpu_cx_states; /* * C1 has been required since just after ACPI 1.0. * Reserve the first slot for it. */ cx_ptr->type = ACPI_STATE_C0; cx_ptr++; sc->cpu_cx_count++; /* Set up all valid states. */ for (i = 0; i < count; i++) { pkg = &top->Package.Elements[i + 1]; if (!ACPI_PKG_VALID(pkg, 4) || acpi_PkgInt32(pkg, 1, &cx_ptr->type) != 0 || acpi_PkgInt32(pkg, 2, &cx_ptr->trans_lat) != 0 || acpi_PkgInt32(pkg, 3, &cx_ptr->power) != 0) { device_printf(sc->cpu_dev, "skipping invalid Cx state package\n"); continue; } /* Validate the state to see if we should use it. */ switch (cx_ptr->type) { case ACPI_STATE_C1: acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr); #if defined(__i386__) || defined(__amd64__) if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address, &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL) { if (class == CST_FFH_INTEL_CL_C1IO) { /* C1 I/O then Halt */ cx_ptr->res_rid = sc->cpu_cx_count; bus_set_resource(sc->cpu_dev, SYS_RES_IOPORT, cx_ptr->res_rid, address, 1); cx_ptr->p_lvlx = bus_alloc_resource_any(sc->cpu_dev, SYS_RES_IOPORT, &cx_ptr->res_rid, RF_ACTIVE | RF_SHAREABLE); if (cx_ptr->p_lvlx == NULL) { bus_delete_resource(sc->cpu_dev, SYS_RES_IOPORT, cx_ptr->res_rid); device_printf(sc->cpu_dev, "C1 I/O failed to allocate port %d, " "degrading to C1 Halt", (int)address); } } else if (class == CST_FFH_INTEL_CL_MWAIT) { acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize); } } #endif if (sc->cpu_cx_states[0].type == ACPI_STATE_C0) { /* This is the first C1 state. Use the reserved slot. */ sc->cpu_cx_states[0] = *cx_ptr; } else { sc->cpu_non_c2 = sc->cpu_cx_count; sc->cpu_non_c3 = sc->cpu_cx_count; cx_ptr++; sc->cpu_cx_count++; } continue; case ACPI_STATE_C2: sc->cpu_non_c3 = sc->cpu_cx_count; break; case ACPI_STATE_C3: default: if ((cpu_quirks & CPU_QUIRK_NO_C3) != 0) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: C3[%d] not available.\n", device_get_unit(sc->cpu_dev), i)); continue; } break; } /* Free up any previous register. */ acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr); /* Allocate the control register for C2 or C3. */ #if defined(__i386__) || defined(__amd64__) if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address, &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL && class == CST_FFH_INTEL_CL_MWAIT) { /* Native C State Instruction use (mwait) */ acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: Got C%d/mwait - %d latency\n", device_get_unit(sc->cpu_dev), cx_ptr->type, cx_ptr->trans_lat)); cx_ptr++; sc->cpu_cx_count++; } else #endif { cx_ptr->res_rid = sc->cpu_cx_count; acpi_PkgGas(sc->cpu_dev, pkg, 0, &cx_ptr->res_type, &cx_ptr->res_rid, &cx_ptr->p_lvlx, RF_SHAREABLE); if (cx_ptr->p_lvlx) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: Got C%d - %d latency\n", device_get_unit(sc->cpu_dev), cx_ptr->type, cx_ptr->trans_lat)); cx_ptr++; sc->cpu_cx_count++; } } } AcpiOsFree(buf.Pointer); /* If C1 state was not found, we need one now. */ cx_ptr = sc->cpu_cx_states; if (cx_ptr->type == ACPI_STATE_C0) { cx_ptr->type = ACPI_STATE_C1; cx_ptr->trans_lat = 0; } return (0); } /* * Call this *after* all CPUs have been attached. */ static void acpi_cpu_startup(void *arg) { struct acpi_cpu_softc *sc; int i; /* Get set of CPU devices */ devclass_get_devices(acpi_cpu_devclass, &cpu_devices, &cpu_ndevices); /* * Setup any quirks that might necessary now that we have probed * all the CPUs */ acpi_cpu_quirks(); if (cpu_cx_generic) { /* * We are using generic Cx mode, probe for available Cx states * for all processors. */ for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); acpi_cpu_generic_cx_probe(sc); } } else { /* * We are using _CST mode, remove C3 state if necessary. * As we now know for sure that we will be using _CST mode * install our notify handler. */ for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); if (cpu_quirks & CPU_QUIRK_NO_C3) { sc->cpu_cx_count = min(sc->cpu_cx_count, sc->cpu_non_c3 + 1); } AcpiInstallNotifyHandler(sc->cpu_handle, ACPI_DEVICE_NOTIFY, acpi_cpu_notify, sc); } } /* Perform Cx final initialization. */ for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); acpi_cpu_startup_cx(sc); } /* Add a sysctl handler to handle global Cx lowest setting */ SYSCTL_ADD_PROC(&cpu_sysctl_ctx, SYSCTL_CHILDREN(cpu_sysctl_tree), OID_AUTO, "cx_lowest", CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, acpi_cpu_global_cx_lowest_sysctl, "A", "Global lowest Cx sleep state to use"); /* Take over idling from cpu_idle_default(). */ cpu_cx_lowest_lim = 0; for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); enable_idle(sc); } #if defined(__i386__) || defined(__amd64__) cpu_idle_hook = acpi_cpu_idle; #endif } static void acpi_cpu_cx_list(struct acpi_cpu_softc *sc) { struct sbuf sb; int i; /* * Set up the list of Cx states */ sbuf_new(&sb, sc->cpu_cx_supported, sizeof(sc->cpu_cx_supported), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) sbuf_printf(&sb, "C%d/%d/%d ", i + 1, sc->cpu_cx_states[i].type, sc->cpu_cx_states[i].trans_lat); sbuf_trim(&sb); sbuf_finish(&sb); } static void acpi_cpu_startup_cx(struct acpi_cpu_softc *sc) { acpi_cpu_cx_list(sc); SYSCTL_ADD_STRING(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_supported", CTLFLAG_RD, sc->cpu_cx_supported, 0, "Cx/microsecond values for supported Cx states"); SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_lowest", CTLTYPE_STRING | CTLFLAG_RW, (void *)sc, 0, acpi_cpu_cx_lowest_sysctl, "A", "lowest Cx sleep state to use"); SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_usage", CTLTYPE_STRING | CTLFLAG_RD, (void *)sc, 0, acpi_cpu_usage_sysctl, "A", "percent usage for each Cx state"); SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_usage_counters", CTLTYPE_STRING | CTLFLAG_RD, (void *)sc, 0, acpi_cpu_usage_counters_sysctl, "A", "Cx sleep state counters"); #if defined(__i386__) || defined(__amd64__) SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_method", CTLTYPE_STRING | CTLFLAG_RD, (void *)sc, 0, acpi_cpu_method_sysctl, "A", "Cx entrance methods"); #endif /* Signal platform that we can handle _CST notification. */ if (!cpu_cx_generic && cpu_cst_cnt != 0) { ACPI_LOCK(acpi); AcpiOsWritePort(cpu_smi_cmd, cpu_cst_cnt, 8); ACPI_UNLOCK(acpi); } } #if defined(__i386__) || defined(__amd64__) /* * Idle the CPU in the lowest state possible. This function is called with * interrupts disabled. Note that once it re-enables interrupts, a task * switch can occur so do not access shared data (i.e. the softc) after * interrupts are re-enabled. */ static void acpi_cpu_idle(sbintime_t sbt) { struct acpi_cpu_softc *sc; struct acpi_cx *cx_next; uint64_t cputicks; uint32_t start_time, end_time; ACPI_STATUS status; int bm_active, cx_next_idx, i, us; /* * Look up our CPU id to get our softc. If it's NULL, we'll use C1 * since there is no ACPI processor object for this CPU. This occurs * for logical CPUs in the HTT case. */ sc = cpu_softc[PCPU_GET(cpuid)]; if (sc == NULL) { acpi_cpu_c1(); return; } /* If disabled, take the safe path. */ if (is_idle_disabled(sc)) { acpi_cpu_c1(); return; } /* Find the lowest state that has small enough latency. */ us = sc->cpu_prev_sleep; if (sbt >= 0 && us > (sbt >> 12)) us = (sbt >> 12); cx_next_idx = 0; if (cpu_disable_c2_sleep) i = min(sc->cpu_cx_lowest, sc->cpu_non_c2); else if (cpu_disable_c3_sleep) i = min(sc->cpu_cx_lowest, sc->cpu_non_c3); else i = sc->cpu_cx_lowest; for (; i >= 0; i--) { if (sc->cpu_cx_states[i].trans_lat * 3 <= us) { cx_next_idx = i; break; } } /* * Check for bus master activity. If there was activity, clear * the bit and use the lowest non-C3 state. Note that the USB * driver polling for new devices keeps this bit set all the * time if USB is loaded. */ if ((cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0 && cx_next_idx > sc->cpu_non_c3) { status = AcpiReadBitRegister(ACPI_BITREG_BUS_MASTER_STATUS, &bm_active); if (ACPI_SUCCESS(status) && bm_active != 0) { AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_STATUS, 1); cx_next_idx = sc->cpu_non_c3; } } /* Select the next state and update statistics. */ cx_next = &sc->cpu_cx_states[cx_next_idx]; sc->cpu_cx_stats[cx_next_idx]++; KASSERT(cx_next->type != ACPI_STATE_C0, ("acpi_cpu_idle: C0 sleep")); /* * Execute HLT (or equivalent) and wait for an interrupt. We can't * precisely calculate the time spent in C1 since the place we wake up * is an ISR. Assume we slept no more then half of quantum, unless * we are called inside critical section, delaying context switch. */ if (cx_next->type == ACPI_STATE_C1) { cputicks = cpu_ticks(); if (cx_next->p_lvlx != NULL) { /* C1 I/O then Halt */ CPU_GET_REG(cx_next->p_lvlx, 1); } if (cx_next->do_mwait) acpi_cpu_idle_mwait(cx_next->mwait_hint); else acpi_cpu_c1(); end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate(); if (curthread->td_critnest == 0) end_time = min(end_time, 500000 / hz); sc->cpu_prev_sleep = (sc->cpu_prev_sleep * 3 + end_time) / 4; return; } /* * For C3, disable bus master arbitration and enable bus master wake * if BM control is available, otherwise flush the CPU cache. */ if (cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) { if ((cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) { AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 1); AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 1); } else ACPI_FLUSH_CPU_CACHE(); } /* * Read from P_LVLx to enter C2(+), checking time spent asleep. * Use the ACPI timer for measuring sleep time. Since we need to * get the time very close to the CPU start/stop clock logic, this * is the only reliable time source. */ if (cx_next->type == ACPI_STATE_C3) { AcpiGetTimer(&start_time); cputicks = 0; } else { start_time = 0; cputicks = cpu_ticks(); } if (cx_next->do_mwait) acpi_cpu_idle_mwait(cx_next->mwait_hint); else CPU_GET_REG(cx_next->p_lvlx, 1); /* * Read the end time twice. Since it may take an arbitrary time * to enter the idle state, the first read may be executed before * the processor has stopped. Doing it again provides enough * margin that we are certain to have a correct value. */ AcpiGetTimer(&end_time); if (cx_next->type == ACPI_STATE_C3) { AcpiGetTimer(&end_time); AcpiGetTimerDuration(start_time, end_time, &end_time); } else end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate(); /* Enable bus master arbitration and disable bus master wakeup. */ if ((cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) && (cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) { AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 0); AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 0); } ACPI_ENABLE_IRQS(); sc->cpu_prev_sleep = (sc->cpu_prev_sleep * 3 + PM_USEC(end_time)) / 4; } #endif /* * Re-evaluate the _CST object when we are notified that it changed. */ static void acpi_cpu_notify(ACPI_HANDLE h, UINT32 notify, void *context) { struct acpi_cpu_softc *sc = (struct acpi_cpu_softc *)context; if (notify != ACPI_NOTIFY_CX_STATES) return; /* * C-state data for target CPU is going to be in flux while we execute * acpi_cpu_cx_cst, so disable entering acpi_cpu_idle. * Also, it may happen that multiple ACPI taskqueues may concurrently * execute notifications for the same CPU. ACPI_SERIAL is used to * protect against that. */ ACPI_SERIAL_BEGIN(cpu); disable_idle(sc); /* Update the list of Cx states. */ acpi_cpu_cx_cst(sc); acpi_cpu_cx_list(sc); acpi_cpu_set_cx_lowest(sc); enable_idle(sc); ACPI_SERIAL_END(cpu); acpi_UserNotify("PROCESSOR", sc->cpu_handle, notify); } static void acpi_cpu_quirks(void) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* * Bus mastering arbitration control is needed to keep caches coherent * while sleeping in C3. If it's not present but a working flush cache * instruction is present, flush the caches before entering C3 instead. * Otherwise, just disable C3 completely. */ if (AcpiGbl_FADT.Pm2ControlBlock == 0 || AcpiGbl_FADT.Pm2ControlLength == 0) { if ((AcpiGbl_FADT.Flags & ACPI_FADT_WBINVD) && (AcpiGbl_FADT.Flags & ACPI_FADT_WBINVD_FLUSH) == 0) { cpu_quirks |= CPU_QUIRK_NO_BM_CTRL; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: no BM control, using flush cache method\n")); } else { cpu_quirks |= CPU_QUIRK_NO_C3; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: no BM control, C3 not available\n")); } } /* * If we are using generic Cx mode, C3 on multiple CPUs requires using * the expensive flush cache instruction. */ if (cpu_cx_generic && mp_ncpus > 1) { cpu_quirks |= CPU_QUIRK_NO_BM_CTRL; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: SMP, using flush cache mode for C3\n")); } /* Look for various quirks of the PIIX4 part. */ acpi_cpu_quirks_piix4(); } static void acpi_cpu_quirks_piix4(void) { #ifdef __i386__ device_t acpi_dev; uint32_t val; ACPI_STATUS status; acpi_dev = pci_find_device(PCI_VENDOR_INTEL, PCI_DEVICE_82371AB_3); if (acpi_dev != NULL) { switch (pci_get_revid(acpi_dev)) { /* * Disable C3 support for all PIIX4 chipsets. Some of these parts * do not report the BMIDE status to the BM status register and * others have a livelock bug if Type-F DMA is enabled. Linux * works around the BMIDE bug by reading the BM status directly * but we take the simpler approach of disabling C3 for these * parts. * * See erratum #18 ("C3 Power State/BMIDE and Type-F DMA * Livelock") from the January 2002 PIIX4 specification update. * Applies to all PIIX4 models. * * Also, make sure that all interrupts cause a "Stop Break" * event to exit from C2 state. * Also, BRLD_EN_BM (ACPI_BITREG_BUS_MASTER_RLD in ACPI-speak) * should be set to zero, otherwise it causes C2 to short-sleep. * PIIX4 doesn't properly support C3 and bus master activity * need not break out of C2. */ case PCI_REVISION_A_STEP: case PCI_REVISION_B_STEP: case PCI_REVISION_4E: case PCI_REVISION_4M: cpu_quirks |= CPU_QUIRK_NO_C3; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: working around PIIX4 bug, disabling C3\n")); val = pci_read_config(acpi_dev, PIIX4_DEVACTB_REG, 4); if ((val & PIIX4_STOP_BREAK_MASK) != PIIX4_STOP_BREAK_MASK) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: PIIX4: enabling IRQs to generate Stop Break\n")); val |= PIIX4_STOP_BREAK_MASK; pci_write_config(acpi_dev, PIIX4_DEVACTB_REG, val, 4); } status = AcpiReadBitRegister(ACPI_BITREG_BUS_MASTER_RLD, &val); if (ACPI_SUCCESS(status) && val != 0) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: PIIX4: reset BRLD_EN_BM\n")); AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 0); } break; default: break; } } #endif } static int acpi_cpu_usage_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; struct sbuf sb; char buf[128]; int i; uintmax_t fract, sum, whole; sc = (struct acpi_cpu_softc *) arg1; sum = 0; for (i = 0; i < sc->cpu_cx_count; i++) sum += sc->cpu_cx_stats[i]; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) { if (sum > 0) { whole = (uintmax_t)sc->cpu_cx_stats[i] * 100; fract = (whole % sum) * 100; sbuf_printf(&sb, "%u.%02u%% ", (u_int)(whole / sum), (u_int)(fract / sum)); } else sbuf_printf(&sb, "0.00%% "); } sbuf_printf(&sb, "last %dus", sc->cpu_prev_sleep); sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } /* * XXX TODO: actually add support to count each entry/exit * from the Cx states. */ static int acpi_cpu_usage_counters_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; struct sbuf sb; char buf[128]; int i; sc = (struct acpi_cpu_softc *) arg1; /* Print out the raw counters */ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) { sbuf_printf(&sb, "%u ", sc->cpu_cx_stats[i]); } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } #if defined(__i386__) || defined(__amd64__) static int acpi_cpu_method_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; struct acpi_cx *cx; struct sbuf sb; char buf[128]; int i; sc = (struct acpi_cpu_softc *)arg1; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) { cx = &sc->cpu_cx_states[i]; sbuf_printf(&sb, "C%d/", i + 1); if (cx->do_mwait) { sbuf_cat(&sb, "mwait"); if (cx->mwait_hw_coord) sbuf_cat(&sb, "/hwc"); if (cx->mwait_bm_avoidance) sbuf_cat(&sb, "/bma"); } else if (cx->type == ACPI_STATE_C1) { sbuf_cat(&sb, "hlt"); } else { sbuf_cat(&sb, "io"); } if (cx->type == ACPI_STATE_C1 && cx->p_lvlx != NULL) sbuf_cat(&sb, "/iohlt"); sbuf_putc(&sb, ' '); } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } #endif static int acpi_cpu_set_cx_lowest(struct acpi_cpu_softc *sc) { int i; ACPI_SERIAL_ASSERT(cpu); sc->cpu_cx_lowest = min(sc->cpu_cx_lowest_lim, sc->cpu_cx_count - 1); /* If not disabling, cache the new lowest non-C3 state. */ sc->cpu_non_c3 = 0; for (i = sc->cpu_cx_lowest; i >= 0; i--) { if (sc->cpu_cx_states[i].type < ACPI_STATE_C3) { sc->cpu_non_c3 = i; break; } } /* Reset the statistics counters. */ bzero(sc->cpu_cx_stats, sizeof(sc->cpu_cx_stats)); return (0); } static int acpi_cpu_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; char state[8]; int val, error; sc = (struct acpi_cpu_softc *) arg1; snprintf(state, sizeof(state), "C%d", sc->cpu_cx_lowest_lim + 1); error = sysctl_handle_string(oidp, state, sizeof(state), req); if (error != 0 || req->newptr == NULL) return (error); if (strlen(state) < 2 || toupper(state[0]) != 'C') return (EINVAL); if (strcasecmp(state, "Cmax") == 0) val = MAX_CX_STATES; else { val = (int) strtol(state + 1, NULL, 10); if (val < 1 || val > MAX_CX_STATES) return (EINVAL); } ACPI_SERIAL_BEGIN(cpu); sc->cpu_cx_lowest_lim = val - 1; acpi_cpu_set_cx_lowest(sc); ACPI_SERIAL_END(cpu); return (0); } static int acpi_cpu_global_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; char state[8]; int val, error, i; snprintf(state, sizeof(state), "C%d", cpu_cx_lowest_lim + 1); error = sysctl_handle_string(oidp, state, sizeof(state), req); if (error != 0 || req->newptr == NULL) return (error); if (strlen(state) < 2 || toupper(state[0]) != 'C') return (EINVAL); if (strcasecmp(state, "Cmax") == 0) val = MAX_CX_STATES; else { val = (int) strtol(state + 1, NULL, 10); if (val < 1 || val > MAX_CX_STATES) return (EINVAL); } /* Update the new lowest useable Cx state for all CPUs. */ ACPI_SERIAL_BEGIN(cpu); cpu_cx_lowest_lim = val - 1; for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); sc->cpu_cx_lowest_lim = cpu_cx_lowest_lim; acpi_cpu_set_cx_lowest(sc); } ACPI_SERIAL_END(cpu); return (0); } Index: stable/11/sys/i386/i386/pmap.c =================================================================== --- stable/11/sys/i386/i386/pmap.c (revision 328385) +++ stable/11/sys/i386/i386/pmap.c (revision 328386) @@ -1,5713 +1,5713 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_vm.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; extern u_int32_t KERNend; extern u_int32_t KPTphys; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; struct msgbuf *msgbufp = NULL; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2; static pt_entry_t *PADDR1 = NULL, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait); #endif static void pmap_set_pg(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). Otherwise, * pmap_update_pde_kernel() could access allpmaps while it is * being changed. */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; /* * Adjust the start of the KPTD and KPTmap so that the implementation * of pmap_kextract() and pmap_growkernel() can be made simpler. */ KPTD -= KPTDI; KPTmap -= i386_btop(KPTDI << PDRSHIFT); /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ static void pmap_set_pg(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag == 0) return; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir_pde(PTD, va) |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pmap_t pmap; boolean_t PTD_updated; PTD_updated = FALSE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) PTD_updated = TRUE; pde = pmap_pde(pmap, va); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); KASSERT(PTD_updated, ("pmap_kenter_pde: current page table is not in allpmaps")); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } } void invltlb_glob(void) { uint64_t cr4; if (pgeflag == 0) { invltlb(); } else { cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); load_cr4(cr4 | CR4_PGE); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb_glob(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { /* * Elsewhere, this operation requires allpmaps_lock for * synchronization. Here, it does not because it is being * performed in the context of an all_cpus rendezvous. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, act->va); pde_store(pde, act->newpde); } } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, - smp_no_rendevous_barrier, pmap == kernel_pmap ? + smp_no_rendezvous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was * created by a promotion that did not invalidate the 512 or 1024 4KB * page mappings that might exist in the TLB. Consequently, at this * point, the TLB may hold both 4KB and 2- or 4MB page mappings for * the address range [va, va + NBPDR). Therefore, the entire range * must be invalidated here. In contrast, when PG_PROMOTED is clear, * the TLB will not hold any 4KB page mappings for the address range * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the * 2- or 4MB page mapping from the TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || pmap == vmspace_pmap(curthread->td_proc->p_vmspace)); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | pgeflag | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pgeflag | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; int count; for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } atomic_subtract_int(&vm_cnt.v_wire_count, count); } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); /* * Since the page table directory is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) if ((ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Copy the kernel page table directory entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); mtx_unlock_spin(&allpmaps_lock); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt[i] = pa | PG_V; #endif } CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; vm_page_free_zero(m); } atomic_subtract_int(&vm_cnt.v_wire_count, NPGPTD); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (va >= KERNBASE) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); pde = pmap_pde(pmap, va); if (va < VM_MAXUSER_ADDRESS) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); #endif sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; u_int ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page_idle: CMAP3 busy"); sched_pin(); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(CADDR3); pagezero(CADDR3); *CMAP3 = 0; sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif Index: stable/11/sys/i386/include/counter.h =================================================================== --- stable/11/sys/i386/include/counter.h (revision 328385) +++ stable/11/sys/i386/include/counter.h (revision 328386) @@ -1,178 +1,178 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif #include #include #define counter_enter() do { \ if ((cpu_feature & CPUID_CX8) == 0) \ critical_enter(); \ } while (0) #define counter_exit() do { \ if ((cpu_feature & CPUID_CX8) == 0) \ critical_exit(); \ } while (0) extern struct pcpu __pcpu[MAXCPU]; static inline void counter_64_inc_8b(uint64_t *p, int64_t inc) { __asm __volatile( "movl %%fs:(%%esi),%%eax\n\t" "movl %%fs:4(%%esi),%%edx\n" "1:\n\t" "movl %%eax,%%ebx\n\t" "movl %%edx,%%ecx\n\t" "addl (%%edi),%%ebx\n\t" "adcl 4(%%edi),%%ecx\n\t" "cmpxchg8b %%fs:(%%esi)\n\t" "jnz 1b" : : "S" ((char *)p - (char *)&__pcpu[0]), "D" (&inc) : "memory", "cc", "eax", "edx", "ebx", "ecx"); } #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one_8b(uint64_t *p) { uint32_t res_lo, res_high; __asm __volatile( "movl %%eax,%%ebx\n\t" "movl %%edx,%%ecx\n\t" "cmpxchg8b (%2)" : "=a" (res_lo), "=d"(res_high) : "SD" (p) : "cc", "ebx", "ecx"); return (res_lo + ((uint64_t)res_high << 32)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t res; int i; res = 0; if ((cpu_feature & CPUID_CX8) == 0) { /* * The machines without cmpxchg8b are not SMP. * Disabling the preemption provides atomicity of the * counter reading, since update is done in the * critical section as well. */ critical_enter(); CPU_FOREACH(i) { res += *(uint64_t *)((char *)p + sizeof(struct pcpu) * i); } critical_exit(); } else { CPU_FOREACH(i) res += counter_u64_read_one_8b((uint64_t *)((char *)p + sizeof(struct pcpu) * i)); } return (res); } static inline void counter_u64_zero_one_8b(uint64_t *p) { __asm __volatile( "movl (%0),%%eax\n\t" "movl 4(%0),%%edx\n" "xorl %%ebx,%%ebx\n\t" "xorl %%ecx,%%ecx\n\t" "1:\n\t" "cmpxchg8b (%0)\n\t" "jnz 1b" : : "SD" (p) : "memory", "cc", "eax", "edx", "ebx", "ecx"); } static void counter_u64_zero_one_cpu(void *arg) { uint64_t *p; p = (uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid)); counter_u64_zero_one_8b(p); } static inline void counter_u64_zero_inline(counter_u64_t c) { int i; if ((cpu_feature & CPUID_CX8) == 0) { critical_enter(); CPU_FOREACH(i) *(uint64_t *)((char *)c + sizeof(struct pcpu) * i) = 0; critical_exit(); } else { - smp_rendezvous(smp_no_rendevous_barrier, - counter_u64_zero_one_cpu, smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, + counter_u64_zero_one_cpu, smp_no_rendezvous_barrier, c); } } #endif #define counter_u64_add_protected(c, inc) do { \ if ((cpu_feature & CPUID_CX8) == 0) { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } else \ counter_64_inc_8b((c), (inc)); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { if ((cpu_feature & CPUID_CX8) == 0) { critical_enter(); *(uint64_t *)zpcpu_get(c) += inc; critical_exit(); } else { counter_64_inc_8b(c, inc); } } #endif /* ! __MACHINE_COUNTER_H__ */ Index: stable/11/sys/kern/kern_rmlock.c =================================================================== --- stable/11/sys/kern/kern_rmlock.c (revision 328385) +++ stable/11/sys/kern/kern_rmlock.c (revision 328386) @@ -1,853 +1,853 @@ /*- * Copyright (c) 2007 Stephan Uphoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine independent bits of reader/writer lock implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* * A cookie to mark destroyed rmlocks. This is stored in the head of * rm_activeReaders. */ #define RM_DESTROYED ((void *)0xdead) #define rm_destroyed(rm) \ (LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED) #define RMPF_ONQUEUE 1 #define RMPF_SIGNAL 2 #ifndef INVARIANTS #define _rm_assert(c, what, file, line) #endif static void assert_rm(const struct lock_object *lock, int what); #ifdef DDB static void db_show_rm(const struct lock_object *lock); #endif static void lock_rm(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_rm(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_rm(struct lock_object *lock); struct lock_class lock_class_rm = { .lc_name = "rm", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE, .lc_assert = assert_rm, #ifdef DDB .lc_ddb_show = db_show_rm, #endif .lc_lock = lock_rm, .lc_unlock = unlock_rm, #ifdef KDTRACE_HOOKS .lc_owner = owner_rm, #endif }; struct lock_class lock_class_rm_sleepable = { .lc_name = "sleepable rm", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE, .lc_assert = assert_rm, #ifdef DDB .lc_ddb_show = db_show_rm, #endif .lc_lock = lock_rm, .lc_unlock = unlock_rm, #ifdef KDTRACE_HOOKS .lc_owner = owner_rm, #endif }; static void assert_rm(const struct lock_object *lock, int what) { rm_assert((const struct rmlock *)lock, what); } static void lock_rm(struct lock_object *lock, uintptr_t how) { struct rmlock *rm; struct rm_priotracker *tracker; rm = (struct rmlock *)lock; if (how == 0) rm_wlock(rm); else { tracker = (struct rm_priotracker *)how; rm_rlock(rm, tracker); } } static uintptr_t unlock_rm(struct lock_object *lock) { struct thread *td; struct pcpu *pc; struct rmlock *rm; struct rm_queue *queue; struct rm_priotracker *tracker; uintptr_t how; rm = (struct rmlock *)lock; tracker = NULL; how = 0; rm_assert(rm, RA_LOCKED | RA_NOTRECURSED); if (rm_wowned(rm)) rm_wunlock(rm); else { /* * Find the right rm_priotracker structure for curthread. * The guarantee about its uniqueness is given by the fact * we already asserted the lock wasn't recursively acquired. */ critical_enter(); td = curthread; pc = pcpu_find(curcpu); for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tracker = (struct rm_priotracker *)queue; if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td)) { how = (uintptr_t)tracker; break; } } KASSERT(tracker != NULL, ("rm_priotracker is non-NULL when lock held in read mode")); critical_exit(); rm_runlock(rm, tracker); } return (how); } #ifdef KDTRACE_HOOKS static int owner_rm(const struct lock_object *lock, struct thread **owner) { const struct rmlock *rm; struct lock_class *lc; rm = (const struct rmlock *)lock; lc = LOCK_CLASS(&rm->rm_wlock_object); return (lc->lc_owner(&rm->rm_wlock_object, owner)); } #endif static struct mtx rm_spinlock; MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN); /* * Add or remove tracker from per-cpu list. * * The per-cpu list can be traversed at any time in forward direction from an * interrupt on the *local* cpu. */ static void inline rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker) { struct rm_queue *next; /* Initialize all tracker pointers */ tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue; next = pc->pc_rm_queue.rmq_next; tracker->rmp_cpuQueue.rmq_next = next; /* rmq_prev is not used during froward traversal. */ next->rmq_prev = &tracker->rmp_cpuQueue; /* Update pointer to first element. */ pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue; } /* * Return a count of the number of trackers the thread 'td' already * has on this CPU for the lock 'rm'. */ static int rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm, const struct thread *td) { struct rm_queue *queue; struct rm_priotracker *tracker; int count; count = 0; for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tracker = (struct rm_priotracker *)queue; if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td)) count++; } return (count); } static void inline rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker) { struct rm_queue *next, *prev; next = tracker->rmp_cpuQueue.rmq_next; prev = tracker->rmp_cpuQueue.rmq_prev; /* Not used during forward traversal. */ next->rmq_prev = prev; /* Remove from list. */ prev->rmq_next = next; } static void rm_cleanIPI(void *arg) { struct pcpu *pc; struct rmlock *rm = arg; struct rm_priotracker *tracker; struct rm_queue *queue; pc = pcpu_find(curcpu); for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tracker = (struct rm_priotracker *)queue; if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) { tracker->rmp_flags = RMPF_ONQUEUE; mtx_lock_spin(&rm_spinlock); LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker, rmp_qentry); mtx_unlock_spin(&rm_spinlock); } } } void rm_init_flags(struct rmlock *rm, const char *name, int opts) { struct lock_class *lc; int liflags, xflags; liflags = 0; if (!(opts & RM_NOWITNESS)) liflags |= LO_WITNESS; if (opts & RM_RECURSE) liflags |= LO_RECURSABLE; if (opts & RM_NEW) liflags |= LO_NEW; rm->rm_writecpus = all_cpus; LIST_INIT(&rm->rm_activeReaders); if (opts & RM_SLEEPABLE) { liflags |= LO_SLEEPABLE; lc = &lock_class_rm_sleepable; xflags = (opts & RM_NEW ? SX_NEW : 0); sx_init_flags(&rm->rm_lock_sx, "rmlock_sx", xflags | SX_NOWITNESS); } else { lc = &lock_class_rm; xflags = (opts & RM_NEW ? MTX_NEW : 0); mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx", xflags | MTX_NOWITNESS); } lock_init(&rm->lock_object, lc, name, NULL, liflags); } void rm_init(struct rmlock *rm, const char *name) { rm_init_flags(rm, name, 0); } void rm_destroy(struct rmlock *rm) { rm_assert(rm, RA_UNLOCKED); LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED; if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_destroy(&rm->rm_lock_sx); else mtx_destroy(&rm->rm_lock_mtx); lock_destroy(&rm->lock_object); } int rm_wowned(const struct rmlock *rm) { if (rm->lock_object.lo_flags & LO_SLEEPABLE) return (sx_xlocked(&rm->rm_lock_sx)); else return (mtx_owned(&rm->rm_lock_mtx)); } void rm_sysinit(void *arg) { struct rm_args *args; args = arg; rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags); } static int _rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) { struct pcpu *pc; critical_enter(); pc = pcpu_find(curcpu); /* Check if we just need to do a proper critical_exit. */ if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) { critical_exit(); return (1); } /* Remove our tracker from the per-cpu list. */ rm_tracker_remove(pc, tracker); /* Check to see if the IPI granted us the lock after all. */ if (tracker->rmp_flags) { /* Just add back tracker - we hold the lock. */ rm_tracker_add(pc, tracker); critical_exit(); return (1); } /* * We allow readers to acquire a lock even if a writer is blocked if * the lock is recursive and the reader already holds the lock. */ if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) { /* * Just grant the lock if this thread already has a tracker * for this lock on the per-cpu queue. */ if (rm_trackers_present(pc, rm, curthread) != 0) { mtx_lock_spin(&rm_spinlock); LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker, rmp_qentry); tracker->rmp_flags = RMPF_ONQUEUE; mtx_unlock_spin(&rm_spinlock); rm_tracker_add(pc, tracker); critical_exit(); return (1); } } sched_unpin(); critical_exit(); if (trylock) { if (rm->lock_object.lo_flags & LO_SLEEPABLE) { if (!sx_try_xlock(&rm->rm_lock_sx)) return (0); } else { if (!mtx_trylock(&rm->rm_lock_mtx)) return (0); } } else { if (rm->lock_object.lo_flags & LO_SLEEPABLE) { THREAD_SLEEPING_OK(); sx_xlock(&rm->rm_lock_sx); THREAD_NO_SLEEPING(); } else mtx_lock(&rm->rm_lock_mtx); } critical_enter(); pc = pcpu_find(curcpu); CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus); rm_tracker_add(pc, tracker); sched_pin(); critical_exit(); if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_xunlock(&rm->rm_lock_sx); else mtx_unlock(&rm->rm_lock_mtx); return (1); } int _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock) { struct thread *td = curthread; struct pcpu *pc; if (SCHEDULER_STOPPED()) return (1); tracker->rmp_flags = 0; tracker->rmp_thread = td; tracker->rmp_rmlock = rm; if (rm->lock_object.lo_flags & LO_SLEEPABLE) THREAD_NO_SLEEPING(); td->td_critnest++; /* critical_enter(); */ __compiler_membar(); pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */ rm_tracker_add(pc, tracker); sched_pin(); __compiler_membar(); td->td_critnest--; /* * Fast path to combine two common conditions into a single * conditional jump. */ if (0 == (td->td_owepreempt | CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))) return (1); /* We do not have a read token and need to acquire one. */ return _rm_rlock_hard(rm, tracker, trylock); } static void _rm_unlock_hard(struct thread *td,struct rm_priotracker *tracker) { if (td->td_owepreempt) { td->td_critnest++; critical_exit(); } if (!tracker->rmp_flags) return; mtx_lock_spin(&rm_spinlock); LIST_REMOVE(tracker, rmp_qentry); if (tracker->rmp_flags & RMPF_SIGNAL) { struct rmlock *rm; struct turnstile *ts; rm = tracker->rmp_rmlock; turnstile_chain_lock(&rm->lock_object); mtx_unlock_spin(&rm_spinlock); ts = turnstile_lookup(&rm->lock_object); turnstile_signal(ts, TS_EXCLUSIVE_QUEUE); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&rm->lock_object); } else mtx_unlock_spin(&rm_spinlock); } void _rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker) { struct pcpu *pc; struct thread *td = tracker->rmp_thread; if (SCHEDULER_STOPPED()) return; td->td_critnest++; /* critical_enter(); */ pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */ rm_tracker_remove(pc, tracker); td->td_critnest--; sched_unpin(); if (rm->lock_object.lo_flags & LO_SLEEPABLE) THREAD_SLEEPING_OK(); if (0 == (td->td_owepreempt | tracker->rmp_flags)) return; _rm_unlock_hard(td, tracker); } void _rm_wlock(struct rmlock *rm) { struct rm_priotracker *prio; struct turnstile *ts; cpuset_t readcpus; if (SCHEDULER_STOPPED()) return; if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_xlock(&rm->rm_lock_sx); else mtx_lock(&rm->rm_lock_mtx); if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) { /* Get all read tokens back */ readcpus = all_cpus; CPU_NAND(&readcpus, &rm->rm_writecpus); rm->rm_writecpus = all_cpus; /* * Assumes rm->rm_writecpus update is visible on other CPUs * before rm_cleanIPI is called. */ #ifdef SMP smp_rendezvous_cpus(readcpus, - smp_no_rendevous_barrier, + smp_no_rendezvous_barrier, rm_cleanIPI, - smp_no_rendevous_barrier, + smp_no_rendezvous_barrier, rm); #else rm_cleanIPI(rm); #endif mtx_lock_spin(&rm_spinlock); while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) { ts = turnstile_trywait(&rm->lock_object); prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL; mtx_unlock_spin(&rm_spinlock); turnstile_wait(ts, prio->rmp_thread, TS_EXCLUSIVE_QUEUE); mtx_lock_spin(&rm_spinlock); } mtx_unlock_spin(&rm_spinlock); } } void _rm_wunlock(struct rmlock *rm) { if (rm->lock_object.lo_flags & LO_SLEEPABLE) sx_xunlock(&rm->rm_lock_sx); else mtx_unlock(&rm->rm_lock_mtx); } #if LOCK_DEBUG > 0 void _rm_wlock_debug(struct rmlock *rm, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d", curthread, rm->lock_object.lo_name, file, line)); KASSERT(!rm_destroyed(rm), ("rm_wlock() of destroyed rmlock @ %s:%d", file, line)); _rm_assert(rm, RA_UNLOCKED, file, line); WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); _rm_wlock(rm); LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line); WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } void _rm_wunlock_debug(struct rmlock *rm, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(!rm_destroyed(rm), ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line)); _rm_assert(rm, RA_WLOCKED, file, line); WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line); _rm_wunlock(rm); TD_LOCKS_DEC(curthread); } int _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, int trylock, const char *file, int line) { if (SCHEDULER_STOPPED()) return (1); #ifdef INVARIANTS if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) { critical_enter(); KASSERT(rm_trackers_present(pcpu_find(curcpu), rm, curthread) == 0, ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n", rm->lock_object.lo_name, file, line)); critical_exit(); } #endif KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d", curthread, rm->lock_object.lo_name, file, line)); KASSERT(!rm_destroyed(rm), ("rm_rlock() of destroyed rmlock @ %s:%d", file, line)); if (!trylock) { KASSERT(!rm_wowned(rm), ("rm_rlock: wlock already held for %s @ %s:%d", rm->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER, file, line, NULL); } if (_rm_rlock(rm, tracker, trylock)) { if (trylock) LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file, line); else LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file, line); WITNESS_LOCK(&rm->lock_object, 0, file, line); TD_LOCKS_INC(curthread); return (1); } else if (trylock) LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line); return (0); } void _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(!rm_destroyed(rm), ("rm_runlock() of destroyed rmlock @ %s:%d", file, line)); _rm_assert(rm, RA_RLOCKED, file, line); WITNESS_UNLOCK(&rm->lock_object, 0, file, line); LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line); _rm_runlock(rm, tracker); TD_LOCKS_DEC(curthread); } #else /* * Just strip out file and line arguments if no lock debugging is enabled in * the kernel - we are called from a kernel module. */ void _rm_wlock_debug(struct rmlock *rm, const char *file, int line) { _rm_wlock(rm); } void _rm_wunlock_debug(struct rmlock *rm, const char *file, int line) { _rm_wunlock(rm); } int _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, int trylock, const char *file, int line) { return _rm_rlock(rm, tracker, trylock); } void _rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker, const char *file, int line) { _rm_runlock(rm, tracker); } #endif #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _rm_assert #endif /* * Note that this does not need to use witness_assert() for read lock * assertions since an exact count of read locks held by this thread * is computable. */ void _rm_assert(const struct rmlock *rm, int what, const char *file, int line) { int count; if (panicstr != NULL) return; switch (what) { case RA_LOCKED: case RA_LOCKED | RA_RECURSED: case RA_LOCKED | RA_NOTRECURSED: case RA_RLOCKED: case RA_RLOCKED | RA_RECURSED: case RA_RLOCKED | RA_NOTRECURSED: /* * Handle the write-locked case. Unlike other * primitives, writers can never recurse. */ if (rm_wowned(rm)) { if (what & RA_RLOCKED) panic("Lock %s exclusively locked @ %s:%d\n", rm->lock_object.lo_name, file, line); if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rm->lock_object.lo_name, file, line); break; } critical_enter(); count = rm_trackers_present(pcpu_find(curcpu), rm, curthread); critical_exit(); if (count == 0) panic("Lock %s not %slocked @ %s:%d\n", rm->lock_object.lo_name, (what & RA_RLOCKED) ? "read " : "", file, line); if (count > 1) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rm->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rm->lock_object.lo_name, file, line); break; case RA_WLOCKED: if (!rm_wowned(rm)) panic("Lock %s not exclusively locked @ %s:%d\n", rm->lock_object.lo_name, file, line); break; case RA_UNLOCKED: if (rm_wowned(rm)) panic("Lock %s exclusively locked @ %s:%d\n", rm->lock_object.lo_name, file, line); critical_enter(); count = rm_trackers_present(pcpu_find(curcpu), rm, curthread); critical_exit(); if (count != 0) panic("Lock %s read locked @ %s:%d\n", rm->lock_object.lo_name, file, line); break; default: panic("Unknown rm lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void print_tracker(struct rm_priotracker *tr) { struct thread *td; td = tr->rmp_thread; db_printf(" thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (tr->rmp_flags & RMPF_ONQUEUE) { db_printf("ONQUEUE"); if (tr->rmp_flags & RMPF_SIGNAL) db_printf(",SIGNAL"); } else db_printf("0"); db_printf("}\n"); } static void db_show_rm(const struct lock_object *lock) { struct rm_priotracker *tr; struct rm_queue *queue; const struct rmlock *rm; struct lock_class *lc; struct pcpu *pc; rm = (const struct rmlock *)lock; db_printf(" writecpus: "); ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus)); db_printf("\n"); db_printf(" per-CPU readers:\n"); STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue; queue = queue->rmq_next) { tr = (struct rm_priotracker *)queue; if (tr->rmp_rmlock == rm) print_tracker(tr); } db_printf(" active readers:\n"); LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry) print_tracker(tr); lc = LOCK_CLASS(&rm->rm_wlock_object); db_printf("Backing write-lock (%s):\n", lc->lc_name); lc->lc_ddb_show(&rm->rm_wlock_object); } #endif Index: stable/11/sys/kern/subr_smp.c =================================================================== --- stable/11/sys/kern/subr_smp.c (revision 328385) +++ stable/11/sys/kern/subr_smp.c (revision 328386) @@ -1,1155 +1,1162 @@ /*- * Copyright (c) 2001, John Baldwin . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS); /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD | CTLTYPE_INT, NULL, 0, sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); int smp_cpus = 1; /* how many cpu's running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_topology = 0; /* Which topology we're using. */ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous. */ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. */ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__)); KASSERT(mp_ncpus > 1 || mp_maxid == 0, ("%s: one CPU but mp_maxid is not zero", __func__)); KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || panicstr) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ #if defined(__amd64__) || defined(__i386__) #define X86 1 #else #define X86 0 #endif static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; volatile cpuset_t *cpus; KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); #if X86 /* * When suspending, ensure there are are no IPIs in progress. * IPIs that have been issued, but not yet delivered (e.g. * not pending on a vCPU when running under virtualization) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ if (type == IPI_SUSPEND) mtx_lock_spin(&smp_ipi_mtx); #endif #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); #if X86 } #endif #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; i = 0; while (!CPU_SUBSET(cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } #if X86 if (type == IPI_SUSPEND) mtx_unlock_spin(&smp_ipi_mtx); #endif stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } #if X86 int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart. * * Returns: * -1: error * 0: NA * 1: ok */ static int generic_restart_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif volatile cpuset_t *cpus; KASSERT(type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; /* signal other cpus to restart */ CPU_COPY_STORE_REL(&map, &started_cpus); #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); #if X86 } #endif return (1); } int restart_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_STOP)); } #if X86 int resume_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_SUSPEND)); } #endif #undef X86 /* * All-CPU rendezvous. CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. */ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. * Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. */ - if (local_setup_func != smp_no_rendevous_barrier) { + if (local_setup_func != smp_no_rendezvous_barrier) { if (smp_rv_setup_func != NULL) smp_rv_setup_func(smp_rv_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); - if (local_teardown_func != smp_no_rendevous_barrier) { + if (local_teardown_func != smp_no_rendezvous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed by smp_rv_func_arg. * * The release semantic ensures that all accesses performed by * the current CPU are visible when smp_rendezvous_cpus() * returns, by synchronizing with the * atomic_load_acq_int(&smp_rv_waiters[3]). */ atomic_add_rel_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* Look comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. */ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. * * Load acquire synchronizes with the release add in * smp_rendezvous_action(), which ensures that our caller sees * all memory actions done by the called functions on other * CPUs. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2. */ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); return (top); } struct cpu_group * smp_topo_alloc(u_int count) { static u_int index; u_int curr; curr = index; index += count; return (&group[curr]); } struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* Look comments in the smp_rendezvous_cpus() case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. */ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ +/* + * smp_no_rendevous_barrier was renamed to smp_no_rendezvous_barrier + * in __FreeBSD_version 1101508, with the old name remaining in 11.x + * as an alias for compatibility. The old name will be gone in 12.0 + * (__FreeBSD_version >= 1200028). + */ +__strong_reference(smp_no_rendezvous_barrier, smp_no_rendevous_barrier); void -smp_no_rendevous_barrier(void *dummy) +smp_no_rendezvous_barrier(void *dummy) { #ifdef SMP - KASSERT((!smp_started),("smp_no_rendevous called and smp is started")); + KASSERT((!smp_started),("smp_no_rendezvous called and smp is started")); #endif } /* * Wait specified idle threads to switch once. This ensures that even * preempted threads have cycled through the switch function once, * exiting their codepaths. This allows us to change global pointers * with no other synchronization. */ int quiesce_cpus(cpuset_t map, const char *wmesg, int prio) { struct pcpu *pcpu; u_int gen[MAXCPU]; int error; int cpu; error = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); gen[cpu] = pcpu->pc_idlethread->td_generation; } for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio, wmesg, 1); if (error != EWOULDBLOCK) goto out; error = 0; } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (error); } int quiesce_all_cpus(const char *wmesg, int prio) { return quiesce_cpus(all_cpus, wmesg, prio); } /* Extra care is taken with this sysctl because the data type is volatile */ static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) { int error, active; active = smp_started; error = SYSCTL_OUT(req, &active, sizeof(active)); return (error); } #ifdef SMP void topo_init_node(struct topo_node *node) { bzero(node, sizeof(*node)); TAILQ_INIT(&node->children); } void topo_init_root(struct topo_node *root) { topo_init_node(root); root->type = TOPO_TYPE_SYSTEM; } /* * Add a child node with the given ID under the given parent. * Do nothing if there is already a child with that ID. */ struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH_REVERSE(node, &parent->children, topo_children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } node = malloc(sizeof(*node), M_TOPO, M_WAITOK); topo_init_node(node); node->parent = parent; node->hwid = hwid; node->type = type; node->subtype = subtype; TAILQ_INSERT_TAIL(&parent->children, node, siblings); parent->nchildren++; return (node); } /* * Find a child node with the given ID under the given parent. */ struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH(node, &parent->children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } return (NULL); } /* * Given a node change the order of its parent's child nodes such * that the node becomes the firt child while preserving the cyclic * order of the children. In other words, the given node is promoted * by rotation. */ void topo_promote_child(struct topo_node *child) { struct topo_node *next; struct topo_node *node; struct topo_node *parent; parent = child->parent; next = TAILQ_NEXT(child, siblings); TAILQ_REMOVE(&parent->children, child, siblings); TAILQ_INSERT_HEAD(&parent->children, child, siblings); while (next != NULL) { node = next; next = TAILQ_NEXT(node, siblings); TAILQ_REMOVE(&parent->children, node, siblings); TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); child = node; } } /* * Iterate to the next node in the depth-first search (traversal) of * the topology tree. */ struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_FIRST(&node->children)) != NULL) return (next); if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while ((node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Iterate to the next node in the depth-first search of the topology tree, * but without descending below the current node. */ struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while ((node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Assign the given ID to the given topology node that represents a logical * processor. */ void topo_set_pu_id(struct topo_node *node, cpuid_t id) { KASSERT(node->type == TOPO_TYPE_PU, ("topo_set_pu_id: wrong node type: %u", node->type)); KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, ("topo_set_pu_id: cpuset already not empty")); node->id = id; CPU_SET(id, &node->cpuset); node->cpu_count = 1; node->subtype = 1; while ((node = node->parent) != NULL) { KASSERT(!CPU_ISSET(id, &node->cpuset), ("logical ID %u is already set in node %p", id, node)); CPU_SET(id, &node->cpuset); node->cpu_count++; } } /* * Check if the topology is uniform, that is, each package has the same number * of cores in it and each core has the same number of threads (logical * processors) in it. If so, calculate the number of package, the number of * cores per package and the number of logical processors per core. * 'all' parameter tells whether to include administratively disabled logical * processors into the analysis. */ int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count, int *cores_per_pkg, int *thrs_per_core) { struct topo_node *pkg_node; struct topo_node *core_node; struct topo_node *pu_node; int thrs_per_pkg; int cpp_counter; int tpc_counter; int tpp_counter; *pkg_count = 0; *cores_per_pkg = -1; *thrs_per_core = -1; thrs_per_pkg = -1; pkg_node = topo_root; while (pkg_node != NULL) { if (pkg_node->type != TOPO_TYPE_PKG) { pkg_node = topo_next_node(topo_root, pkg_node); continue; } if (!all && CPU_EMPTY(&pkg_node->cpuset)) { pkg_node = topo_next_nonchild_node(topo_root, pkg_node); continue; } (*pkg_count)++; cpp_counter = 0; tpp_counter = 0; core_node = pkg_node; while (core_node != NULL) { if (core_node->type == TOPO_TYPE_CORE) { if (!all && CPU_EMPTY(&core_node->cpuset)) { core_node = topo_next_nonchild_node(pkg_node, core_node); continue; } cpp_counter++; tpc_counter = 0; pu_node = core_node; while (pu_node != NULL) { if (pu_node->type == TOPO_TYPE_PU && (all || !CPU_EMPTY(&pu_node->cpuset))) tpc_counter++; pu_node = topo_next_node(core_node, pu_node); } if (*thrs_per_core == -1) *thrs_per_core = tpc_counter; else if (*thrs_per_core != tpc_counter) return (0); core_node = topo_next_nonchild_node(pkg_node, core_node); } else { /* PU node directly under PKG. */ if (core_node->type == TOPO_TYPE_PU && (all || !CPU_EMPTY(&core_node->cpuset))) tpp_counter++; core_node = topo_next_node(pkg_node, core_node); } } if (*cores_per_pkg == -1) *cores_per_pkg = cpp_counter; else if (*cores_per_pkg != cpp_counter) return (0); if (thrs_per_pkg == -1) thrs_per_pkg = tpp_counter; else if (thrs_per_pkg != tpp_counter) return (0); pkg_node = topo_next_nonchild_node(topo_root, pkg_node); } KASSERT(*pkg_count > 0, ("bug in topology or analysis")); if (*cores_per_pkg == 0) { KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0, ("bug in topology or analysis")); *thrs_per_core = thrs_per_pkg; } return (1); } #endif /* SMP */ Index: stable/11/sys/mips/include/counter.h =================================================================== --- stable/11/sys/mips/include/counter.h (revision 328385) +++ stable/11/sys/mips/include/counter.h (revision 328386) @@ -1,94 +1,94 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif #define counter_enter() critical_enter() #define counter_exit() critical_exit() #ifdef IN_SUBR_COUNTER_C /* XXXKIB non-atomic 64bit read on 32bit */ static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; for (i = 0; i < mp_ncpus; i++) r += counter_u64_read_one((uint64_t *)p, i); return (r); } /* XXXKIB non-atomic 64bit store on 32bit, might interrupt increment */ static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) do { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { counter_enter(); counter_u64_add_protected(c, inc); counter_exit(); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: stable/11/sys/powerpc/include/counter.h =================================================================== --- stable/11/sys/powerpc/include/counter.h (revision 328385) +++ stable/11/sys/powerpc/include/counter.h (revision 328386) @@ -1,162 +1,162 @@ /*- * Copyright (c) 2012, 2013 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif #ifdef __powerpc64__ #define counter_enter() do {} while (0) #define counter_exit() do {} while (0) #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; CPU_FOREACH(i) r += counter_u64_read_one((uint64_t *)p, i); return (r); } static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, i) counter_u64_add(c, i) extern struct pcpu __pcpu[MAXCPU]; static inline void counter_u64_add(counter_u64_t c, int64_t inc) { uint64_t ccpu, old; __asm __volatile("\n" "1:\n\t" "mfsprg %0, 0\n\t" "ldarx %1, %0, %2\n\t" "add %1, %1, %3\n\t" "stdcx. %1, %0, %2\n\t" "bne- 1b" : "=&b" (ccpu), "=&r" (old) : "r" ((char *)c - (char *)&__pcpu[0]), "r" (inc) : "cr0", "memory"); } #else /* !64bit */ #define counter_enter() critical_enter() #define counter_exit() critical_exit() #ifdef IN_SUBR_COUNTER_C /* XXXKIB non-atomic 64bit read */ static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; for (i = 0; i < mp_ncpus; i++) r += counter_u64_read_one((uint64_t *)p, i); return (r); } /* XXXKIB non-atomic 64bit store, might interrupt increment */ static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) do { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { counter_enter(); counter_u64_add_protected(c, inc); counter_exit(); } #endif /* 64bit */ #endif /* ! __MACHINE_COUNTER_H__ */ Index: stable/11/sys/powerpc/ofw/ofw_machdep.c =================================================================== --- stable/11/sys/powerpc/ofw/ofw_machdep.c (revision 328385) +++ stable/11/sys/powerpc/ofw/ofw_machdep.c (revision 328386) @@ -1,606 +1,606 @@ /*- * Copyright (C) 1996 Wolfgang Solfrank. * Copyright (C) 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: ofw_machdep.c,v 1.5 2000/05/23 13:25:43 tsubai Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_platform.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void *fdt; int ofw_real_mode; #ifdef AIM extern register_t ofmsr[5]; extern void *openfirmware_entry; char save_trap_init[0x2f00]; /* EXC_LAST */ char save_trap_of[0x2f00]; /* EXC_LAST */ int ofwcall(void *); static int openfirmware(void *args); __inline void ofw_save_trap_vec(char *save_trap_vec) { if (!ofw_real_mode) return; bcopy((void *)EXC_RST, save_trap_vec, EXC_LAST - EXC_RST); } static __inline void ofw_restore_trap_vec(char *restore_trap_vec) { if (!ofw_real_mode) return; bcopy(restore_trap_vec, (void *)EXC_RST, EXC_LAST - EXC_RST); __syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD); } /* * Saved SPRG0-3 from OpenFirmware. Will be restored prior to the callback. */ register_t ofw_sprg0_save; static __inline void ofw_sprg_prepare(void) { if (ofw_real_mode) return; /* * Assume that interrupt are disabled at this point, or * SPRG1-3 could be trashed */ #ifdef __powerpc64__ __asm __volatile("mtsprg1 %0\n\t" "mtsprg2 %1\n\t" "mtsprg3 %2\n\t" : : "r"(ofmsr[2]), "r"(ofmsr[3]), "r"(ofmsr[4])); #else __asm __volatile("mfsprg0 %0\n\t" "mtsprg0 %1\n\t" "mtsprg1 %2\n\t" "mtsprg2 %3\n\t" "mtsprg3 %4\n\t" : "=&r"(ofw_sprg0_save) : "r"(ofmsr[1]), "r"(ofmsr[2]), "r"(ofmsr[3]), "r"(ofmsr[4])); #endif } static __inline void ofw_sprg_restore(void) { if (ofw_real_mode) return; /* * Note that SPRG1-3 contents are irrelevant. They are scratch * registers used in the early portion of trap handling when * interrupts are disabled. * * PCPU data cannot be used until this routine is called ! */ #ifndef __powerpc64__ __asm __volatile("mtsprg0 %0" :: "r"(ofw_sprg0_save)); #endif } #endif static int parse_ofw_memory(phandle_t node, const char *prop, struct mem_region *output) { cell_t address_cells, size_cells; cell_t OFmem[4 * PHYS_AVAIL_SZ]; int sz, i, j; phandle_t phandle; sz = 0; /* * Get #address-cells from root node, defaulting to 1 if it cannot * be found. */ phandle = OF_finddevice("/"); if (OF_getencprop(phandle, "#address-cells", &address_cells, sizeof(address_cells)) < (ssize_t)sizeof(address_cells)) address_cells = 1; if (OF_getencprop(phandle, "#size-cells", &size_cells, sizeof(size_cells)) < (ssize_t)sizeof(size_cells)) size_cells = 1; /* * Get memory. */ if (node == -1 || (sz = OF_getencprop(node, prop, OFmem, sizeof(OFmem))) <= 0) panic("Physical memory map not found"); i = 0; j = 0; while (i < sz/sizeof(cell_t)) { #if !defined(__powerpc64__) && !defined(BOOKE) /* On 32-bit PPC (OEA), ignore regions starting above 4 GB */ if (address_cells > 1 && OFmem[i] > 0) { i += address_cells + size_cells; continue; } #endif output[j].mr_start = OFmem[i++]; if (address_cells == 2) { output[j].mr_start <<= 32; output[j].mr_start += OFmem[i++]; } output[j].mr_size = OFmem[i++]; if (size_cells == 2) { output[j].mr_size <<= 32; output[j].mr_size += OFmem[i++]; } #if !defined(__powerpc64__) && !defined(BOOKE) /* Book-E can support 36-bit addresses. */ /* * Check for memory regions extending above 32-bit * memory space, and restrict them to stay there. */ if (((uint64_t)output[j].mr_start + (uint64_t)output[j].mr_size) > BUS_SPACE_MAXADDR_32BIT) { output[j].mr_size = BUS_SPACE_MAXADDR_32BIT - output[j].mr_start; } #endif j++; } sz = j*sizeof(output[0]); return (sz); } static int excise_fdt_reserved(struct mem_region *avail, int asz) { struct { uint64_t address; uint64_t size; } fdtmap[16]; ssize_t fdtmapsize; phandle_t chosen; int i, j, k; chosen = OF_finddevice("/chosen"); fdtmapsize = OF_getprop(chosen, "fdtmemreserv", fdtmap, sizeof(fdtmap)); for (j = 0; j < fdtmapsize/sizeof(fdtmap[0]); j++) { fdtmap[j].address = be64toh(fdtmap[j].address); fdtmap[j].size = be64toh(fdtmap[j].size); } for (i = 0; i < asz; i++) { for (j = 0; j < fdtmapsize/sizeof(fdtmap[0]); j++) { /* * Case 1: Exclusion region encloses complete * available entry. Drop it and move on. */ if (fdtmap[j].address <= avail[i].mr_start && fdtmap[j].address + fdtmap[j].size >= avail[i].mr_start + avail[i].mr_size) { for (k = i+1; k < asz; k++) avail[k-1] = avail[k]; asz--; i--; /* Repeat some entries */ continue; } /* * Case 2: Exclusion region starts in available entry. * Trim it to where the entry begins and append * a new available entry with the region after * the excluded region, if any. */ if (fdtmap[j].address >= avail[i].mr_start && fdtmap[j].address < avail[i].mr_start + avail[i].mr_size) { if (fdtmap[j].address + fdtmap[j].size < avail[i].mr_start + avail[i].mr_size) { avail[asz].mr_start = fdtmap[j].address + fdtmap[j].size; avail[asz].mr_size = avail[i].mr_start + avail[i].mr_size - avail[asz].mr_start; asz++; } avail[i].mr_size = fdtmap[j].address - avail[i].mr_start; } /* * Case 3: Exclusion region ends in available entry. * Move start point to where the exclusion zone ends. * The case of a contained exclusion zone has already * been caught in case 2. */ if (fdtmap[j].address + fdtmap[j].size >= avail[i].mr_start && fdtmap[j].address + fdtmap[j].size < avail[i].mr_start + avail[i].mr_size) { avail[i].mr_size += avail[i].mr_start; avail[i].mr_start = fdtmap[j].address + fdtmap[j].size; avail[i].mr_size -= avail[i].mr_start; } } } return (asz); } /* * This is called during powerpc_init, before the system is really initialized. * It shall provide the total and the available regions of RAM. * The available regions need not take the kernel into account. */ void ofw_mem_regions(struct mem_region *memp, int *memsz, struct mem_region *availp, int *availsz) { phandle_t phandle; int asz, msz; int res; char name[31]; asz = msz = 0; /* * Get memory from all the /memory nodes. */ for (phandle = OF_child(OF_peer(0)); phandle != 0; phandle = OF_peer(phandle)) { if (OF_getprop(phandle, "name", name, sizeof(name)) <= 0) continue; if (strncmp(name, "memory", sizeof(name)) != 0 && strncmp(name, "memory@", strlen("memory@")) != 0) continue; res = parse_ofw_memory(phandle, "reg", &memp[msz]); msz += res/sizeof(struct mem_region); if (OF_getproplen(phandle, "available") >= 0) res = parse_ofw_memory(phandle, "available", &availp[asz]); else res = parse_ofw_memory(phandle, "reg", &availp[asz]); asz += res/sizeof(struct mem_region); } phandle = OF_finddevice("/chosen"); if (OF_hasprop(phandle, "fdtmemreserv")) asz = excise_fdt_reserved(availp, asz); *memsz = msz; *availsz = asz; } void OF_initial_setup(void *fdt_ptr, void *junk, int (*openfirm)(void *)) { #ifdef AIM ofmsr[0] = mfmsr(); #ifdef __powerpc64__ ofmsr[0] &= ~PSL_SF; #else __asm __volatile("mfsprg0 %0" : "=&r"(ofmsr[1])); #endif __asm __volatile("mfsprg1 %0" : "=&r"(ofmsr[2])); __asm __volatile("mfsprg2 %0" : "=&r"(ofmsr[3])); __asm __volatile("mfsprg3 %0" : "=&r"(ofmsr[4])); openfirmware_entry = openfirm; if (ofmsr[0] & PSL_DR) ofw_real_mode = 0; else ofw_real_mode = 1; ofw_save_trap_vec(save_trap_init); #else ofw_real_mode = 1; #endif fdt = fdt_ptr; #ifdef FDT_DTB_STATIC /* Check for a statically included blob */ if (fdt == NULL) fdt = &fdt_static_dtb; #endif } boolean_t OF_bootstrap() { boolean_t status = FALSE; #ifdef AIM if (openfirmware_entry != NULL) { if (ofw_real_mode) { status = OF_install(OFW_STD_REAL, 0); } else { #ifdef __powerpc64__ status = OF_install(OFW_STD_32BIT, 0); #else status = OF_install(OFW_STD_DIRECT, 0); #endif } if (status != TRUE) return status; OF_init(openfirmware); } else #endif if (fdt != NULL) { status = OF_install(OFW_FDT, 0); if (status != TRUE) return status; OF_init(fdt); OF_interpret("perform-fixup", 0); } return (status); } #ifdef AIM void ofw_quiesce(void) { struct { cell_t name; cell_t nargs; cell_t nreturns; } args; KASSERT(!pmap_bootstrapped, ("Cannot call ofw_quiesce after VM is up")); args.name = (cell_t)(uintptr_t)"quiesce"; args.nargs = 0; args.nreturns = 0; openfirmware(&args); } static int openfirmware_core(void *args) { int result; register_t oldmsr; if (openfirmware_entry == NULL) return (-1); /* * Turn off exceptions - we really don't want to end up * anywhere unexpected with PCPU set to something strange * or the stack pointer wrong. */ oldmsr = intr_disable(); ofw_sprg_prepare(); /* Save trap vectors */ ofw_save_trap_vec(save_trap_of); /* Restore initially saved trap vectors */ ofw_restore_trap_vec(save_trap_init); #if defined(AIM) && !defined(__powerpc64__) /* * Clear battable[] translations */ if (!(cpu_features & PPC_FEATURE_64)) __asm __volatile("mtdbatu 2, %0\n" "mtdbatu 3, %0" : : "r" (0)); isync(); #endif result = ofwcall(args); /* Restore trap vecotrs */ ofw_restore_trap_vec(save_trap_of); ofw_sprg_restore(); intr_restore(oldmsr); return (result); } #ifdef SMP struct ofw_rv_args { void *args; int retval; volatile int in_progress; }; static void ofw_rendezvous_dispatch(void *xargs) { struct ofw_rv_args *rv_args = xargs; /* NOTE: Interrupts are disabled here */ if (PCPU_GET(cpuid) == 0) { /* * Execute all OF calls on CPU 0 */ rv_args->retval = openfirmware_core(rv_args->args); rv_args->in_progress = 0; } else { /* * Spin with interrupts off on other CPUs while OF has * control of the machine. */ while (rv_args->in_progress) cpu_spinwait(); } } #endif static int openfirmware(void *args) { int result; #ifdef SMP struct ofw_rv_args rv_args; #endif if (openfirmware_entry == NULL) return (-1); #ifdef SMP rv_args.args = args; rv_args.in_progress = 1; - smp_rendezvous(smp_no_rendevous_barrier, ofw_rendezvous_dispatch, - smp_no_rendevous_barrier, &rv_args); + smp_rendezvous(smp_no_rendezvous_barrier, ofw_rendezvous_dispatch, + smp_no_rendezvous_barrier, &rv_args); result = rv_args.retval; #else result = openfirmware_core(args); #endif return (result); } void OF_reboot() { struct { cell_t name; cell_t nargs; cell_t nreturns; cell_t arg; } args; args.name = (cell_t)(uintptr_t)"interpret"; args.nargs = 1; args.nreturns = 0; args.arg = (cell_t)(uintptr_t)"reset-all"; openfirmware_core(&args); /* Don't do rendezvous! */ for (;;); /* just in case */ } #endif /* AIM */ void OF_getetheraddr(device_t dev, u_char *addr) { phandle_t node; node = ofw_bus_get_node(dev); OF_getprop(node, "local-mac-address", addr, ETHER_ADDR_LEN); } /* * Return a bus handle and bus tag that corresponds to the register * numbered regno for the device referenced by the package handle * dev. This function is intended to be used by console drivers in * early boot only. It works by mapping the address of the device's * register in the address space of its parent and recursively walk * the device tree upward this way. */ int OF_decode_addr(phandle_t dev, int regno, bus_space_tag_t *tag, bus_space_handle_t *handle, bus_size_t *sz) { bus_addr_t addr; bus_size_t size; pcell_t pci_hi; int flags, res; res = ofw_reg_to_paddr(dev, regno, &addr, &size, &pci_hi); if (res < 0) return (res); if (pci_hi == OFW_PADDR_NOT_PCI) { *tag = &bs_be_tag; flags = 0; } else { *tag = &bs_le_tag; flags = (pci_hi & OFW_PCI_PHYS_HI_PREFETCHABLE) ? BUS_SPACE_MAP_PREFETCHABLE: 0; } if (sz != NULL) *sz = size; return (bus_space_map(*tag, addr, size, flags, handle)); } Index: stable/11/sys/riscv/include/counter.h =================================================================== --- stable/11/sys/riscv/include/counter.h (revision 328385) +++ stable/11/sys/riscv/include/counter.h (revision 328386) @@ -1,93 +1,93 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_COUNTER_H_ #define _MACHINE_COUNTER_H_ #include #ifdef INVARIANTS #include #endif #define counter_enter() critical_enter() #define counter_exit() critical_exit() #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; for (i = 0; i < mp_ncpus; i++) r += counter_u64_read_one((uint64_t *)p, i); return (r); } /* XXXKIB might interrupt increment */ static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) do { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { counter_enter(); counter_u64_add_protected(c, inc); counter_exit(); } #endif /* ! _MACHINE_COUNTER_H_ */ Index: stable/11/sys/sparc64/include/counter.h =================================================================== --- stable/11/sys/sparc64/include/counter.h (revision 328385) +++ stable/11/sys/sparc64/include/counter.h (revision 328386) @@ -1,93 +1,93 @@ /*- * Copyright (c) 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_COUNTER_H__ #define __MACHINE_COUNTER_H__ #include #ifdef INVARIANTS #include #endif #define counter_enter() critical_enter() #define counter_exit() critical_exit() #ifdef IN_SUBR_COUNTER_C static inline uint64_t counter_u64_read_one(uint64_t *p, int cpu) { return (*(uint64_t *)((char *)p + sizeof(struct pcpu) * cpu)); } static inline uint64_t counter_u64_fetch_inline(uint64_t *p) { uint64_t r; int i; r = 0; for (i = 0; i < mp_ncpus; i++) r += counter_u64_read_one((uint64_t *)p, i); return (r); } /* XXXKIB might interrupt increment */ static void counter_u64_zero_one_cpu(void *arg) { *((uint64_t *)((char *)arg + sizeof(struct pcpu) * PCPU_GET(cpuid))) = 0; } static inline void counter_u64_zero_inline(counter_u64_t c) { - smp_rendezvous(smp_no_rendevous_barrier, counter_u64_zero_one_cpu, - smp_no_rendevous_barrier, c); + smp_rendezvous(smp_no_rendezvous_barrier, counter_u64_zero_one_cpu, + smp_no_rendezvous_barrier, c); } #endif #define counter_u64_add_protected(c, inc) do { \ CRITICAL_ASSERT(curthread); \ *(uint64_t *)zpcpu_get(c) += (inc); \ } while (0) static inline void counter_u64_add(counter_u64_t c, int64_t inc) { counter_enter(); counter_u64_add_protected(c, inc); counter_exit(); } #endif /* ! __MACHINE_COUNTER_H__ */ Index: stable/11/sys/sys/param.h =================================================================== --- stable/11/sys/sys/param.h (revision 328385) +++ stable/11/sys/sys/param.h (revision 328386) @@ -1,365 +1,365 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1101507 /* Master, propagated to newvers */ +#define __FreeBSD_version 1101508 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_MAP_GUARD_11 1101501 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_WRFSBASE_11 1101503 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef lint #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* lint */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: stable/11/sys/sys/smp.h =================================================================== --- stable/11/sys/sys/smp.h (revision 328385) +++ stable/11/sys/sys/smp.h (revision 328386) @@ -1,255 +1,262 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ */ #ifndef _SYS_SMP_H_ #define _SYS_SMP_H_ #ifdef _KERNEL #ifndef LOCORE #include #include /* * Types of nodes in the topological tree. */ typedef enum { /* No node has this type; can be used in topo API calls. */ TOPO_TYPE_DUMMY, /* Processing unit aka computing unit aka logical CPU. */ TOPO_TYPE_PU, /* Physical subdivision of a package. */ TOPO_TYPE_CORE, /* CPU L1/L2/L3 cache. */ TOPO_TYPE_CACHE, /* Package aka chip, equivalent to socket. */ TOPO_TYPE_PKG, /* NUMA node. */ TOPO_TYPE_NODE, /* Other logical or physical grouping of PUs. */ /* E.g. PUs on the same dye, or PUs sharing an FPU. */ TOPO_TYPE_GROUP, /* The whole system. */ TOPO_TYPE_SYSTEM } topo_node_type; /* Hardware indenitifier of a topology component. */ typedef unsigned int hwid_t; /* Logical CPU idenitifier. */ typedef int cpuid_t; /* A node in the topology. */ struct topo_node { struct topo_node *parent; TAILQ_HEAD(topo_children, topo_node) children; TAILQ_ENTRY(topo_node) siblings; cpuset_t cpuset; topo_node_type type; uintptr_t subtype; hwid_t hwid; cpuid_t id; int nchildren; int cpu_count; }; /* * Scheduling topology of a NUMA or SMP system. * * The top level topology is an array of pointers to groups. Each group * contains a bitmask of cpus in its group or subgroups. It may also * contain a pointer to an array of child groups. * * The bitmasks at non leaf groups may be used by consumers who support * a smaller depth than the hardware provides. * * The topology may be omitted by systems where all CPUs are equal. */ struct cpu_group { struct cpu_group *cg_parent; /* Our parent group. */ struct cpu_group *cg_child; /* Optional children groups. */ cpuset_t cg_mask; /* Mask of cpus in this group. */ int32_t cg_count; /* Count of cpus in this group. */ int16_t cg_children; /* Number of children groups. */ int8_t cg_level; /* Shared cache level. */ int8_t cg_flags; /* Traversal modifiers. */ }; typedef struct cpu_group *cpu_group_t; /* * Defines common resources for CPUs in the group. The highest level * resource should be used when multiple are shared. */ #define CG_SHARE_NONE 0 #define CG_SHARE_L1 1 #define CG_SHARE_L2 2 #define CG_SHARE_L3 3 #define MAX_CACHE_LEVELS CG_SHARE_L3 /* * Behavior modifiers for load balancing and affinity. */ #define CG_FLAG_HTT 0x01 /* Schedule the alternate core last. */ #define CG_FLAG_SMT 0x02 /* New age htt, less crippled. */ #define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */ /* * Convenience routines for building and traversing topologies. */ #ifdef SMP void topo_init_node(struct topo_node *node); void topo_init_root(struct topo_node *root); struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype); struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype); void topo_promote_child(struct topo_node *child); struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node); struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node); void topo_set_pu_id(struct topo_node *node, cpuid_t id); int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count, int *cores_per_pkg, int *thrs_per_core); #define TOPO_FOREACH(i, root) \ for (i = root; i != NULL; i = topo_next_node(root, i)) struct cpu_group *smp_topo(void); struct cpu_group *smp_topo_alloc(u_int count); struct cpu_group *smp_topo_none(void); struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu); extern void (*cpustop_restartfunc)(void); extern int smp_cpus; extern volatile cpuset_t started_cpus; extern volatile cpuset_t stopped_cpus; extern volatile cpuset_t suspended_cpus; extern cpuset_t hlt_cpus_mask; extern cpuset_t logical_cpus_mask; #endif /* SMP */ extern u_int mp_maxid; extern int mp_maxcpus; extern int mp_ncpus; extern volatile int smp_started; extern cpuset_t all_cpus; extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */ /* * Macro allowing us to determine whether a CPU is absent at any given * time, thus permitting us to configure sparse maps of cpuid-dependent * (per-CPU) structures. */ #define CPU_ABSENT(x_cpu) (!CPU_ISSET(x_cpu, &all_cpus)) /* * Macros to iterate over non-absent CPUs. CPU_FOREACH() takes an * integer iterator and iterates over the available set of CPUs. * CPU_FIRST() returns the id of the first non-absent CPU. CPU_NEXT() * returns the id of the next non-absent CPU. It will wrap back to * CPU_FIRST() once the end of the list is reached. The iterators are * currently implemented via inline functions. */ #define CPU_FOREACH(i) \ for ((i) = 0; (i) <= mp_maxid; (i)++) \ if (!CPU_ABSENT((i))) static __inline int cpu_first(void) { int i; for (i = 0;; i++) if (!CPU_ABSENT(i)) return (i); } static __inline int cpu_next(int i) { for (;;) { i++; if (i > mp_maxid) i = 0; if (!CPU_ABSENT(i)) return (i); } } #define CPU_FIRST() cpu_first() #define CPU_NEXT(i) cpu_next((i)) #ifdef SMP /* * Machine dependent functions used to initialize MP support. * * The cpu_mp_probe() should check to see if MP support is present and return * zero if it is not or non-zero if it is. If MP support is present, then * cpu_mp_start() will be called so that MP can be enabled. This function * should do things such as startup secondary processors. It should also * setup mp_ncpus, all_cpus, and smp_cpus. It should also ensure that * smp_started is initialized at the appropriate time. * Once cpu_mp_start() returns, machine independent MP startup code will be * executed and a simple message will be output to the console. Finally, * cpu_mp_announce() will be called so that machine dependent messages about * the MP support may be output to the console if desired. * * The cpu_setmaxid() function is called very early during the boot process * so that the MD code may set mp_maxid to provide an upper bound on CPU IDs * that other subsystems may use. If a platform is not able to determine * the exact maximum ID that early, then it may set mp_maxid to MAXCPU - 1. */ struct thread; struct cpu_group *cpu_topo(void); void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); void cpu_mp_start(void); void forward_signal(struct thread *); int restart_cpus(cpuset_t); int stop_cpus(cpuset_t); int stop_cpus_hard(cpuset_t); #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t); int resume_cpus(cpuset_t); #endif void smp_rendezvous_action(void); extern struct mtx smp_ipi_mtx; #endif /* SMP */ int quiesce_all_cpus(const char *, int); int quiesce_cpus(cpuset_t, const char *, int); +/* + * smp_no_rendevous_barrier was renamed to smp_no_rendezvous_barrier + * in __FreeBSD_version 1101508, with the old name remaining in 11.x + * as an alias for compatibility. The old name will be gone in 12.0 + * (__FreeBSD_version >= 1200028). + */ void smp_no_rendevous_barrier(void *); +void smp_no_rendezvous_barrier(void *); void smp_rendezvous(void (*)(void *), void (*)(void *), void (*)(void *), void *arg); void smp_rendezvous_cpus(cpuset_t, void (*)(void *), void (*)(void *), void (*)(void *), void *arg); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* _SYS_SMP_H_ */ Index: stable/11/sys/x86/x86/tsc.c =================================================================== --- stable/11/sys/x86/x86/tsc.c (revision 328385) +++ stable/11/sys/x86/x86/tsc.c (revision 328386) @@ -1,759 +1,759 @@ /*- * Copyright (c) 1998-2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_clock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" uint64_t tsc_freq; int tsc_is_invariant; int tsc_perf_stat; static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag; SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN, &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant"); #ifdef SMP int smp_tsc; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0, "Indicates whether the TSC is safe to use in SMP mode"); int smp_tsc_adjust = 0; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN, &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP"); #endif static int tsc_shift = 1; SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN, &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency"); static int tsc_disabled; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0, "Disable x86 Time Stamp Counter"); static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, "Disable TSC frequency calibration"); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status); static unsigned tsc_get_timecount(struct timecounter *tc); static inline unsigned tsc_get_timecount_low(struct timecounter *tc); static unsigned tsc_get_timecount_lfence(struct timecounter *tc); static unsigned tsc_get_timecount_low_lfence(struct timecounter *tc); static unsigned tsc_get_timecount_mfence(struct timecounter *tc); static unsigned tsc_get_timecount_low_mfence(struct timecounter *tc); static void tsc_levels_changed(void *arg, int unit); static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc); #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc); #endif static struct timecounter tsc_timecounter = { .tc_get_timecount = tsc_get_timecount, .tc_counter_mask = ~0u, .tc_name = "TSC", .tc_quality = 800, /* adjusted in code */ .tc_fill_vdso_timehands = x86_tsc_vdso_timehands, #ifdef COMPAT_FREEBSD32 .tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32, #endif }; static void tsc_freq_vmware(void) { u_int regs[4]; if (hv_high >= 0x40000010) { do_cpuid(0x40000010, regs); tsc_freq = regs[0] * 1000; } else { vmware_hvcall(VMW_HVCMD_GETHZ, regs); if (regs[1] != UINT_MAX) tsc_freq = regs[0] | ((uint64_t)regs[1] << 32); } tsc_is_invariant = 1; } static void tsc_freq_intel(void) { char brand[48]; u_int regs[4]; uint64_t freq; char *p; u_int i; /* * Intel Processor Identification and the CPUID Instruction * Application Note 485. * http://www.intel.com/assets/pdf/appnote/241618.pdf */ if (cpu_exthigh >= 0x80000004) { p = brand; for (i = 0x80000002; i < 0x80000005; i++) { do_cpuid(i, regs); memcpy(p, regs, sizeof(regs)); p += sizeof(regs); } p = NULL; for (i = 0; i < sizeof(brand) - 1; i++) if (brand[i] == 'H' && brand[i + 1] == 'z') p = brand + i; if (p != NULL) { p -= 5; switch (p[4]) { case 'M': i = 1; break; case 'G': i = 1000; break; case 'T': i = 1000000; break; default: return; } #define C2D(c) ((c) - '0') if (p[1] == '.') { freq = C2D(p[0]) * 1000; freq += C2D(p[2]) * 100; freq += C2D(p[3]) * 10; freq *= i * 1000; } else { freq = C2D(p[0]) * 1000; freq += C2D(p[1]) * 100; freq += C2D(p[2]) * 10; freq += C2D(p[3]); freq *= i * 1000000; } #undef C2D tsc_freq = freq; } } } static void probe_tsc_freq(void) { u_int regs[4]; uint64_t tsc1, tsc2; if (cpu_high >= 6) { do_cpuid(6, regs); if ((regs[2] & CPUID_PERF_STAT) != 0) { /* * XXX Some emulators expose host CPUID without actual * support for these MSRs. We must test whether they * really work. */ wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); DELAY(10); if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0) tsc_perf_stat = 1; } } if (vm_guest == VM_GUEST_VMWARE) { tsc_freq_vmware(); return; } switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) >= 0x10)) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_mfence; } break; case CPU_VENDOR_INTEL: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && ((CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xe) || (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x3)))) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; case CPU_VENDOR_CENTAUR: if (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf && (rdmsr(0x1203) & 0x100000000ULL) == 0) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; } if (tsc_skip_calibration) { if (cpu_vendor_id == CPU_VENDOR_INTEL) tsc_freq_intel(); return; } if (bootverbose) printf("Calibrating TSC clock ... "); tsc1 = rdtsc(); DELAY(1000000); tsc2 = rdtsc(); tsc_freq = tsc2 - tsc1; if (bootverbose) printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq); } void init_TSC(void) { if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; #ifdef __i386__ /* The TSC is known to be broken on certain CPUs. */ switch (cpu_vendor_id) { case CPU_VENDOR_AMD: switch (cpu_id & 0xFF0) { case 0x500: /* K5 Model 0 */ return; } break; case CPU_VENDOR_CENTAUR: switch (cpu_id & 0xff0) { case 0x540: /* * http://www.centtech.com/c6_data_sheet.pdf * * I-12 RDTSC may return incoherent values in EDX:EAX * I-13 RDTSC hangs when certain event counters are used */ return; } break; case CPU_VENDOR_NSC: switch (cpu_id & 0xff0) { case 0x540: if ((cpu_id & CPUID_STEPPING) == 0) return; break; } break; } #endif probe_tsc_freq(); /* * Inform CPU accounting about our boot-time clock rate. This will * be updated if someone loads a cpufreq driver after boot that * discovers a new max frequency. */ if (tsc_freq != 0) set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); if (tsc_is_invariant) return; /* Register to find out about changes in CPU frequency. */ tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change, tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST); tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST); tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed, tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY); } #ifdef SMP /* * RDTSC is not a serializing instruction, and does not drain * instruction stream, so we need to drain the stream before executing * it. It could be fixed by use of RDTSCP, except the instruction is * not available everywhere. * * Use CPUID for draining in the boot-time SMP constistency test. The * timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel * and VIA) when SSE2 is present, and nothing on older machines which * also do not issue RDTSC prematurely. There, testing for SSE2 and * vendor is too cumbersome, and we learn about TSC presence from CPUID. * * Do not use do_cpuid(), since we do not need CPUID results, which * have to be written into memory with do_cpuid(). */ #define TSC_READ(x) \ static void \ tsc_read_##x(void *arg) \ { \ uint64_t *tsc = arg; \ u_int cpu = PCPU_GET(cpuid); \ \ __asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx"); \ tsc[cpu * 3 + x] = rdtsc(); \ } TSC_READ(0) TSC_READ(1) TSC_READ(2) #undef TSC_READ #define N 1000 static void comp_smp_tsc(void *arg) { uint64_t *tsc; int64_t d1, d2; u_int cpu = PCPU_GET(cpuid); u_int i, j, size; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) CPU_FOREACH(j) { if (j == cpu) continue; d1 = tsc[cpu * 3 + 1] - tsc[j * 3]; d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1]; if (d1 <= 0 || d2 <= 0) { smp_tsc = 0; return; } } } static void adj_smp_tsc(void *arg) { uint64_t *tsc; int64_t d, min, max; u_int cpu = PCPU_GET(cpuid); u_int first, i, size; first = CPU_FIRST(); if (cpu == first) return; min = INT64_MIN; max = INT64_MAX; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) { d = tsc[first * 3] - tsc[cpu * 3 + 1]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3]; if (d < max) max = d; d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1]; if (d < max) max = d; } if (min > max) return; d = min / 2 + max / 2; __asm __volatile ( "movl $0x10, %%ecx\n\t" "rdmsr\n\t" "addl %%edi, %%eax\n\t" "adcl %%esi, %%edx\n\t" "wrmsr\n" : /* No output */ : "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32)) : "ax", "cx", "dx", "cc" ); } static int test_tsc(void) { uint64_t *data, *tsc; u_int i, size, adj; if ((!smp_tsc && !tsc_is_invariant) || vm_guest) return (-100); size = (mp_maxid + 1) * 3; data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK); adj = 0; retry: for (i = 0, tsc = data; i < N; i++, tsc += size) smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc); smp_tsc = 1; /* XXX */ - smp_rendezvous(smp_no_rendevous_barrier, comp_smp_tsc, - smp_no_rendevous_barrier, data); + smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc, + smp_no_rendezvous_barrier, data); if (!smp_tsc && adj < smp_tsc_adjust) { adj++; - smp_rendezvous(smp_no_rendevous_barrier, adj_smp_tsc, - smp_no_rendevous_barrier, data); + smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc, + smp_no_rendezvous_barrier, data); goto retry; } free(data, M_TEMP); if (bootverbose) printf("SMP: %sed TSC synchronization test%s\n", smp_tsc ? "pass" : "fail", adj > 0 ? " after adjustment" : ""); if (smp_tsc && tsc_is_invariant) { switch (cpu_vendor_id) { case CPU_VENDOR_AMD: /* * Starting with Family 15h processors, TSC clock * source is in the north bridge. Check whether * we have a single-socket/multi-core platform. * XXX Need more work for complex cases. */ if (CPUID_TO_FAMILY(cpu_id) < 0x15 || (amd_feature2 & AMDID2_CMP) == 0 || smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1) break; return (1000); case CPU_VENDOR_INTEL: /* * XXX Assume Intel platforms have synchronized TSCs. */ return (1000); } return (800); } return (-100); } #undef N #else /* * The function is not called, it is provided to avoid linking failure * on uniprocessor kernel. */ static int test_tsc(void) { return (0); } #endif /* SMP */ static void init_TSC_tc(void) { uint64_t max_freq; int shift; if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; /* * Limit timecounter frequency to fit in an int and prevent it from * overflowing too fast. */ max_freq = UINT_MAX; /* * We can not use the TSC if we support APM. Precise timekeeping * on an APM'ed machine is at best a fools pursuit, since * any and all of the time spent in various SMM code can't * be reliably accounted for. Reading the RTC is your only * source of reliable time info. The i8254 loses too, of course, * but we need to have some kind of time... * We don't know at this point whether APM is going to be used * or not, nor when it might be activated. Play it safe. */ if (power_pm_get_type() == POWER_PM_TYPE_APM) { tsc_timecounter.tc_quality = -1000; if (bootverbose) printf("TSC timecounter disabled: APM enabled.\n"); goto init; } /* * Intel CPUs without a C-state invariant TSC can stop the TSC * in either C2 or C3. Disable use of C2 and C3 while using * the TSC as the timecounter. The timecounter can be changed * to enable C2 and C3. * * Note that the TSC is used as the cputicker for computing * thread runtime regardless of the timecounter setting, so * using an alternate timecounter and enabling C2 or C3 can * result incorrect runtimes for kernel idle threads (but not * for any non-idle threads). */ if (cpu_vendor_id == CPU_VENDOR_INTEL && (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) { tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP; if (bootverbose) printf("TSC timecounter disables C2 and C3.\n"); } /* * We can not use the TSC in SMP mode unless the TSCs on all CPUs * are synchronized. If the user is sure that the system has * synchronized TSCs, set kern.timecounter.smp_tsc tunable to a * non-zero value. The TSC seems unreliable in virtualized SMP * environments, so it is set to a negative quality in those cases. */ if (mp_ncpus > 1) tsc_timecounter.tc_quality = test_tsc(); else if (tsc_is_invariant) tsc_timecounter.tc_quality = 1000; max_freq >>= tsc_shift; init: for (shift = 0; shift <= 31 && (tsc_freq >> shift) > max_freq; shift++) ; if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) { if (cpu_vendor_id == CPU_VENDOR_AMD) { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low_mfence : tsc_get_timecount_mfence; } else { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low_lfence : tsc_get_timecount_lfence; } } else { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low : tsc_get_timecount; } if (shift > 0) { tsc_timecounter.tc_name = "TSC-low"; if (bootverbose) printf("TSC timecounter discards lower %d bit(s)\n", shift); } if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq >> shift; tsc_timecounter.tc_priv = (void *)(intptr_t)shift; tc_init(&tsc_timecounter); } } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); /* * When cpufreq levels change, find out about the (new) max frequency. We * use this to update CPU accounting in case it got a lower estimate at boot. */ static void tsc_levels_changed(void *arg, int unit) { device_t cf_dev; struct cf_level *levels; int count, error; uint64_t max_freq; /* Only use values from the first CPU, assuming all are equal. */ if (unit != 0) return; /* Find the appropriate cpufreq device instance. */ cf_dev = devclass_get_device(devclass_find("cpufreq"), unit); if (cf_dev == NULL) { printf("tsc_levels_changed() called but no cpufreq device?\n"); return; } /* Get settings from the device and find the max frequency. */ count = 64; levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return; error = CPUFREQ_LEVELS(cf_dev, levels, &count); if (error == 0 && count != 0) { max_freq = (uint64_t)levels[0].total_set.freq * 1000000; set_cputicker(rdtsc, max_freq, 1); } else printf("tsc_levels_changed: no max freq found\n"); free(levels, M_TEMP); } /* * If the TSC timecounter is in use, veto the pending change. It may be * possible in the future to handle a dynamically-changing timecounter rate. */ static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status) { if (*status != 0 || timecounter != &tsc_timecounter) return; printf("timecounter TSC must not be in use when " "changing frequencies; change denied\n"); *status = EBUSY; } /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { uint64_t freq; /* If there was an error during the transition, don't do anything. */ if (tsc_disabled || status != 0) return; /* Total setting for this level gives the new frequency in MHz. */ freq = (uint64_t)level->total_set.freq * 1000000; atomic_store_rel_64(&tsc_freq, freq); tsc_timecounter.tc_frequency = freq >> (int)(intptr_t)tsc_timecounter.tc_priv; } static int sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS) { int error; uint64_t freq; freq = atomic_load_acq_64(&tsc_freq); if (freq == 0) return (EOPNOTSUPP); error = sysctl_handle_64(oidp, &freq, 0, req); if (error == 0 && req->newptr != NULL) { atomic_store_rel_64(&tsc_freq, freq); atomic_store_rel_64(&tsc_timecounter.tc_frequency, freq >> (int)(intptr_t)tsc_timecounter.tc_priv); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW, 0, 0, sysctl_machdep_tsc_freq, "QU", "Time Stamp Counter frequency"); static u_int tsc_get_timecount(struct timecounter *tc __unused) { return (rdtsc32()); } static inline u_int tsc_get_timecount_low(struct timecounter *tc) { uint32_t rv; __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx"); return (rv); } static u_int tsc_get_timecount_lfence(struct timecounter *tc __unused) { lfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_lfence(struct timecounter *tc) { lfence(); return (tsc_get_timecount_low(tc)); } static u_int tsc_get_timecount_mfence(struct timecounter *tc __unused) { mfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_mfence(struct timecounter *tc) { mfence(); return (tsc_get_timecount_low(tc)); } static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th->th_x86_hpet_idx = 0xffffffff; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc) { vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th32->th_x86_hpet_idx = 0xffffffff; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (1); } #endif Index: stable/11 =================================================================== --- stable/11 (revision 328385) +++ stable/11 (revision 328386) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r316648