Index: head/share/man/man9/vm_page_wire.9 =================================================================== --- head/share/man/man9/vm_page_wire.9 (revision 352406) +++ head/share/man/man9/vm_page_wire.9 (revision 352407) @@ -1,83 +1,83 @@ .\" .\" Copyright (C) 2001 Chad David . All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR .\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER .\" CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH .\" DAMAGE. .\" .\" $FreeBSD$ .\" .Dd September 9, 2019 .Dt VM_PAGE_WIRE 9 .Os .Sh NAME .Nm vm_page_wire , .Nm vm_page_unwire , .Nm vm_page_unwire_noq .Nd "wire and unwire pages" .Sh SYNOPSIS .In sys/param.h .In vm/vm.h .In vm/vm_page.h .Ft void .Fn vm_page_wire "vm_page_t m" .Ft bool .Fn vm_page_wire_mapped "vm_page_t m" .Ft void .Fn vm_page_unwire "vm_page_t m" "int queue" .Ft bool .Fn vm_page_unwire_noq "vm_page_t m" .Sh DESCRIPTION The .Fn vm_page_wire and .Fn vm_page_wire_mapped -functions wire the page, which prevents it from being reclaimed by the page +function wire the page, prevent it from being reclaimed by the page daemon or when its containing object is destroyed. Both functions require that the page belong to an object. The .Fn vm_page_wire_mapped function is for use by the .Xr pmap 9 layer following a lookup. This function may fail if mappings of the page are concurrently being destroyed, in which case it will return false. .Pp The .Fn vm_page_unwire and .Fn vm_page_unwire_noq functions release a wiring of a page. The .Fn vm_page_unwire function takes a queue index and will insert the page into the corresponding page queue upon releasing its last wiring. If the page does not belong to an object and no other references to the page exist, .Fn vm_page_unwire will free the page. .Fn vm_page_unwire_noq releases the wiring and returns true if it was the last wiring of the page. .Sh AUTHORS This manual page was written by .An Chad David Aq Mt davidc@acns.ab.ca . Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 352406) +++ head/sys/amd64/amd64/pmap.c (revision 352407) @@ -1,10306 +1,10308 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * Copyright (c) 2014-2019 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_ddb.h" #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static pt_entry_t pg_g; static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = pg_g; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_pku_mask_bit(pmap_t pmap) { return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); } #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx __exclusive_cache_line pv_chunks_mutex; static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pti, 0, "Page Table Isolation enabled"); static vm_object_t pti_obj; static pml4_entry_t *pti_pml4; static vm_pindex_t pti_pg_idx; static bool pti_finalized; struct pmap_pkru_range { struct rs_el pkru_rs_el; u_int pkru_keyidx; int pkru_flags; }; static uma_zone_t pmap_pkru_ranges_zone; static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void *pkru_dup_range(void *ctx, void *data); static void pkru_free_range(void *ctx, void *node); static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void pmap_pkru_deassign_all(pmap_t pmap); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; static struct pmap_invl_gen pmap_invl_gen_head = { .gen = 1, .next = NULL, }; static u_long pmap_invl_gen = 1; static int pmap_invl_waiters; static struct callout pmap_invl_callout; static bool pmap_invl_callout_inited; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(pmap_not_in_di(), ("DI already started")) static bool pmap_di_locked(void) { int tun; if ((cpu_feature2 & CPUID2_CX16) == 0) return (true); tun = 0; TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); return (tun != 0); } static int sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) { int locked; locked = pmap_di_locked(); return (sysctl_handle_int(oidp, &locked, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", "Locked delayed invalidation"); static bool pmap_not_in_di_l(void); static bool pmap_not_in_di_u(void); DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) { return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); } static bool pmap_not_in_di_l(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (invl_gen->gen == 0); } static void pmap_thread_init_invl_gen_l(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; } static void pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) { struct turnstile *ts; ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > atomic_load_long(invl_gen)) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } static void pmap_delayed_invl_finish_unblock(u_long new_gen) { struct turnstile *ts; turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); if (new_gen != 0) pmap_invl_gen = new_gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts); } turnstile_chain_unlock(&invl_gen_ts); } /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_start_l(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. */ static void pmap_delayed_invl_finish_l(void) { struct pmap_invl_gen *invl_gen, *next; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) pmap_delayed_invl_finish_unblock(invl_gen->gen); else next->gen = invl_gen->gen; LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } static bool pmap_not_in_di_u(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); } static void pmap_thread_init_invl_gen_u(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; } static bool pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) { uint64_t new_high, new_low, old_high, old_low; char res; old_low = new_low = 0; old_high = new_high = (uintptr_t)0; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); if (res == 0) { if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) return (false); out->gen = old_low; out->next = (void *)old_high; } else { out->gen = new_low; out->next = (void *)new_high; } return (true); } static bool pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, struct pmap_invl_gen *new_val) { uint64_t new_high, new_low, old_high, old_low; char res; new_low = new_val->gen; new_high = (uintptr_t)new_val->next; old_low = old_val->gen; old_high = (uintptr_t)old_val->next; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); return (res); } #ifdef PV_STATS static long invl_start_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD, &invl_start_restart, 0, ""); static long invl_finish_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, &invl_finish_restart, 0, ""); static int invl_max_qlen; SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, &invl_max_qlen, 0, ""); #endif static struct lock_delay_config __read_frequently di_delay; LOCK_DELAY_SYSINIT_DEFAULT(di_delay); static void pmap_delayed_invl_start_u(void) { struct pmap_invl_gen *invl_gen, *p, prev, new_prev; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; u_char pri; #ifdef PV_STATS int i, ii; #endif td = curthread; invl_gen = &td->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); lock_delay_arg_init(&lda, &di_delay); invl_gen->saved_pri = 0; pri = td->td_base_pri; if (pri > PVM) { thread_lock(td); pri = td->td_base_pri; if (pri > PVM) { invl_gen->saved_pri = pri; sched_prio(td, PVM); } thread_unlock(td); } again: PV_STAT(i = 0); for (p = &pmap_invl_gen_head;; p = prev.next) { PV_STAT(i++); prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } if (prevl == 0) break; prev.next = (void *)prevl; } #ifdef PV_STATS if ((ii = invl_max_qlen) < i) atomic_cmpset_int(&invl_max_qlen, ii, i); #endif if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } new_prev.gen = prev.gen; new_prev.next = invl_gen; invl_gen->gen = prev.gen + 1; /* Formal fence between store to invl->gen and updating *p. */ atomic_thread_fence_rel(); /* * After inserting an invl_gen element with invalid bit set, * this thread blocks any other thread trying to enter the * delayed invalidation block. Do not allow to remove us from * the CPU, because it causes starvation for other threads. */ critical_enter(); /* * ABA for *p is not possible there, since p->gen can only * increase. So if the *p thread finished its di, then * started a new one and got inserted into the list at the * same place, its gen will appear greater than the previously * read gen. */ if (!pmap_di_store_invl(p, &prev, &new_prev)) { critical_exit(); PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } /* * There we clear PMAP_INVL_GEN_NEXT_INVALID in * invl_gen->next, allowing other threads to iterate past us. * pmap_di_store_invl() provides fence between the generation * write and the update of next. */ invl_gen->next = NULL; critical_exit(); } static bool pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, struct pmap_invl_gen *p) { struct pmap_invl_gen prev, new_prev; u_long mygen; /* * Load invl_gen->gen after setting invl_gen->next * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger * generations to propagate to our invl_gen->gen. Lock prefix * in atomic_set_ptr() worked as seq_cst fence. */ mygen = atomic_load_long(&invl_gen->gen); if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) return (false); KASSERT(prev.gen < mygen, ("invalid di gen sequence %lu %lu", prev.gen, mygen)); new_prev.gen = mygen; new_prev.next = (void *)((uintptr_t)invl_gen->next & ~PMAP_INVL_GEN_NEXT_INVALID); /* Formal fence between load of prev and storing update to it. */ atomic_thread_fence_rel(); return (pmap_di_store_invl(p, &prev, &new_prev)); } static void pmap_delayed_invl_finish_u(void) { struct pmap_invl_gen *invl_gen, *p; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; td = curthread; invl_gen = &td->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, ("missed invl_start: INVALID")); lock_delay_arg_init(&lda, &di_delay); again: for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } if ((void *)prevl == invl_gen) break; } /* * It is legitimate to not find ourself on the list if a * thread before us finished its DI and started it again. */ if (__predict_false(p == NULL)) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_enter(); atomic_set_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { atomic_clear_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); critical_exit(); PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_exit(); if (atomic_load_int(&pmap_invl_waiters) > 0) pmap_delayed_invl_finish_unblock(0); if (invl_gen->saved_pri != 0) { thread_lock(td); sched_prio(td, invl_gen->saved_pri); thread_unlock(td); } } #ifdef DDB DB_SHOW_COMMAND(di_queue, pmap_di_queue) { struct pmap_invl_gen *p, *pn; struct thread *td; uintptr_t nextl; bool first; for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, first = false) { nextl = atomic_load_ptr(&p->next); pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); td = first ? NULL : __containerof(p, struct thread, td_md.md_invl_gen); db_printf("gen %lu inv %d td %p tid %d\n", p->gen, (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, td != NULL ? td->td_tid : -1); } } #endif #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); static long invl_wait_slow; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0, "Number of slow invalidation waits for lockless DI"); #endif static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } static void pmap_delayed_invl_callout_func(void *arg __unused) { if (atomic_load_int(&pmap_invl_waiters) == 0) return; pmap_delayed_invl_finish_unblock(0); } static void pmap_delayed_invl_callout_init(void *arg __unused) { if (pmap_di_locked()) return; callout_init(&pmap_invl_callout, 1); pmap_invl_callout_inited = true; } SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_delayed_invl_callout_init, NULL); /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait_l(vm_page_t m) { u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); } } static void pmap_delayed_invl_wait_u(vm_page_t m) { u_long *m_gen; struct lock_delay_arg lda; bool fast; fast = true; m_gen = pmap_delayed_invl_genp(m); lock_delay_arg_init(&lda, &di_delay); while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { if (fast || !pmap_invl_callout_inited) { PV_STAT(atomic_add_long(&invl_wait, 1)); lock_delay(&lda); fast = false; } else { /* * The page's invalidation generation number * is still below the current thread's number. * Prepare to block so that we do not waste * CPU cycles or worse, suffer livelock. * * Since it is impossible to block without * racing with pmap_delayed_invl_finish_u(), * prepare for the race by incrementing * pmap_invl_waiters and arming a 1-tick * callout which will unblock us if we lose * the race. */ atomic_add_int(&pmap_invl_waiters, 1); /* * Re-check the current thread's invalidation * generation after incrementing * pmap_invl_waiters, so that there is no race * with pmap_delayed_invl_finish_u() setting * the page generation and checking * pmap_invl_waiters. The only race allowed * is for a missed unblock, which is handled * by the callout. */ if (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { callout_reset(&pmap_invl_callout, 1, pmap_delayed_invl_callout_func, NULL); PV_STAT(atomic_add_long(&invl_wait_slow, 1)); pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen_head.gen); } atomic_add_int(&pmap_invl_waiters, -1); } } } DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) { return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_start_l : pmap_delayed_invl_start_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_finish_l : pmap_delayed_invl_finish_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) { return (pmap_di_locked() ? pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u); } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finish(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ /* * Internal flags for pmap_mapdev_internal() and * pmap_change_attr_locked(). */ #define MAPDEV_FLUSHCACHE 0x0000001 /* Flush cache after mapping. */ #define MAPDEV_SETATTR 0x0000002 /* Modify existing attrs. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, int flags); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec); static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } /* * Returns the proper write/execute permission for a physical page that is * part of the initial boot allocations. * * If the page has kernel text, it is marked as read-only. If the page has * kernel read-only data, it is marked as read-only/not-executable. If the * page has only read-write data, it is marked as read-write/not-executable. * If the page is below/above the kernel range, it is marked as read-write. * * This function operates on 2M pages, since we map the kernel space that * way. * * Note that this doesn't currently provide any protection for modules. */ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { /* * Everything in the same 2M page as the start of the kernel * should be static. On the other hand, things in the same 2M * page as the end of the kernel could be read-write/executable, * as the kernel image is not guaranteed to end on a 2M boundary. */ if (pa < trunc_2mpage(btext - KERNBASE) || pa >= trunc_2mpage(_end - KERNBASE)) return (X86_PG_RW); /* * The linker should ensure that the read-only and read-write * portions don't share the same 2M page, so this shouldn't * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. */ if (pa >= trunc_2mpage(brwsection - KERNBASE)) return (X86_PG_RW | pg_nx); /* * Mark any 2M page containing kernel text as read-only. Mark * other pages with read-only data as read-only and not executable. * (It is likely a small portion of the read-only data section will * be marked as read-only, but executable. This should be acceptable * since the read-only protection will keep the data from changing.) * Note that fixups to the .text section will still work until we * set CR0.WP. */ if (pa < round_2mpage(etext - KERNBASE)) return (0); return (pg_nx); } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe, nkdmpde; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { /* * Calculate the number of 1G pages that will fully fit in * Maxmem. */ ndm1g = ptoa(Maxmem) >> PDPSHIFT; /* * Allocate 2M pages for the kernel. These will be used in * place of the first one or more 1G pages from ndm1g. */ nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* * Connect the zero-filled PT pages to their PD entries. This * implicitly maps the PT pages at their correct locations within * the PTmap. */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Map from physical address zero to the end of loader preallocated * memory using 2MB pages. This replaces some of the PD entries * created above. */ for (i = 0; (i << PDRSHIFT) < KERNend; i++) /* Preset PG_M and PG_A because demotion expects it. */ pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); /* * Because we map the physical blocks in 2M pages, adjust firstaddr * to record the physical blocks we've actually mapped into kernel * virtual address space. */ if (*firstaddr < round_2mpage(KERNend)) *firstaddr = round_2mpage(KERNend); /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V; } /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with appropriate permissions. (If using 1G pages, * this will partially overwrite the PDPEs above.) */ if (ndm1g) { pd_p = (pd_entry_t *)DMPDkernphys; for (i = 0; i < (NPDEPG * nkdmpde); i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx | bootaddr_rwx(i << PDRSHIFT); for (i = 0; i < nkdmpde; i++) pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte, *pcpu_pte; uint64_t cr4, pcpu_phys; u_long res; int i; KERNend = *firstaddr; res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); pcpu_phys = allocpages(firstaddr, MAXCPU); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Account for the virtual addresses mapped by create_pagetables(). */ virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it * is possible to enable SMEP and SMAP since PG_U bits are * correct now. */ cr4 = rcr4(); cr4 |= CR4_PGE; load_cr4(cr4); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; load_cr4(cr4); /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); virtual_avail = va; for (i = 0; i < MAXCPU; i++) { pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | pg_g | pg_nx | X86_PG_M | X86_PG_A; } STAILQ_INIT(&cpuhead); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); amd64_bsp_pcpu_init1(&__pcpu[0]); amd64_bsp_ist_init(&__pcpu[0]); __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } /* * PMAP_PCID_KERN + 1 is used for initialization of * proc0 pmap. The pmap' pcid state might be used by * EFIRT entry before first context switch, so it * needs to be valid. */ PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); PCPU_SET(pcid_gen, 1); /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = -1; pat_index[PAT_WRITE_BACK] = 0; pat_index[PAT_WRITE_THROUGH] = 1; pat_index[PAT_UNCACHEABLE] = 3; pat_index[PAT_WRITE_COMBINING] = 6; pat_index[PAT_WRITE_PROTECTED] = 5; pat_index[PAT_UNCACHED] = 2; /* * Initialize default PAT entries. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * * Leave 4 and 7 as WB and UC. Note that a recursive page table * mapping for a 2M page uses a PAT value with the bit 3 set due * to its overload with PG_PS. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; vm_size_t s; int error, i, pv_npg, ret, skz63; /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); /* Detect bare-metal Skylake Server and Skylake-X. */ if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { /* * Skylake-X errata SKZ63. Processor May Hang When * Executing Code In an HLE Transaction Region between * 40000000H and 403FFFFFH. * * Mark the pages in the range as preallocated. It * seems to be impossible to distinguish between * Skylake Server and Skylake X. */ skz63 = 1; TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); if (skz63 != 0) { if (bootverbose) printf("SKZ63: skipping 4M RAM starting " "at physical 1G\n"); for (i = 0; i < atop(0x400000); i++) { ret = vm_page_blacklist_add(0x40000000 + ptoa(i), FALSE); if (!ret && bootverbose) printf("page at %#lx already used\n", 0x40000000 + ptoa(i)); } } } /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); mpte->wire_count = 1; /* * Collect the page table pages that were replaced by a 2MB * page in create_pagetables(). They are zero filled. */ if (i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); if (lm_ents > LMEPML4I - LMSPML4I + 1) lm_ents = LMEPML4I - LMSPML4I + 1; if (bootverbose) printf("pmap: large map %u PML4 slots (%lu Gb)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } } } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } static int pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) { int pat_flag, pat_idx; pat_idx = 0; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; if ((pte & pat_flag) != 0) pat_idx |= 0x4; if ((pte & PG_NC_PCD) != 0) pat_idx |= 0x2; if ((pte & PG_NC_PWT) != 0) pat_idx |= 0x1; break; case PT_EPT: if ((pte & EPT_PG_IGNORE_PAT) != 0) panic("EPT PTE %#lx has no PAT memory type", pte); pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; break; } /* See pmap_init_pat(). */ if (pat_idx == 4) pat_idx = 0; if (pat_idx == 7) pat_idx = 3; return (pat_idx); } bool pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } static cpuset_t pmap_invalidate_cpu_mask(pmap_t pmap) { return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); } static inline void pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Because pm_pcid is recalculated on a * context switch, we must disable switching. * Otherwise, we might use a stale value * below. */ critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* * The fence is between stores to pm_gen and the read of the * pm_active mask. We need to ensure that it is impossible * for us to miss the bit update in pm_active and * simultaneously observe a non-zero pm_gen in * pmap_activate_sw(), otherwise TLB update is missed. * Without the fence, IA32 allows such an outcome. Note that * pm_active is updated by a locked operation, which provides * the reciprocal fence. */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, true); } static void pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, false); } static void pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) { } DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : pmap_invalidate_page_pcid_noinvpcid); return (pmap_invalidate_page_nopcid); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { invlpg(va); } else { if (pmap == PCPU_GET(curpmap)) invlpg(va); pmap_invalidate_page_mode(pmap, va); } smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, true); } static void pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, false); } static void pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { } DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_range_pcid_invpcid : pmap_invalidate_range_pcid_noinvpcid); return (pmap_invalidate_range_nopcid); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } pmap_invalidate_range_mode(pmap, sva, eva); } smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); sched_unpin(); } static inline void pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; if (pmap == kernel_pmap) { if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pcid; ucr3 = pmap->pm_ucr3; if (ucr3 != PMAP_NO_CR3) { ucr3 |= pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else { load_cr3(kcr3); } } critical_exit(); } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_all_pcid_invpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, true); } static void pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, false); } static void pmap_invalidate_all_nopcid(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (pmap == PCPU_GET(curpmap)) invltlb(); } DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : pmap_invalidate_all_pcid_noinvpcid); return (pmap_invalidate_all_nopcid); } void pmap_invalidate_all(pmap_t pmap) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); pmap_invalidate_all_mode(pmap); smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { invlpg(va); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[0].pm_pcid; if (invpcid_works) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct invpcid_descr d; vm_offset_t addr; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; if (pmap->pm_ucr3 != PMAP_NO_CR3) { ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 0].pm_pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); } critical_exit(); } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + NBPDR). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + NBPDR), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t sva, vm_offset_t eva)) { if ((cpu_feature & CPUID_SS) != 0) return (pmap_invalidate_cache_range_selfsnoop); if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_force_invalidate_cache_range); return (pmap_invalidate_cache_range_all); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) static void pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); } void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC range. The * local APIC is always uncached, so we don't need to flush * for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* * Do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); pmap_invalidate_cache(); } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } void pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { pmap_force_invalidate_cache_range(sva, eva); return; } /* See comment in pmap_force_invalidate_cache_range(). */ if (pmap_kextract(sva) == lapic_paddr) return; sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clwb(sva); sfence(); } void pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) { pt_entry_t *pte; vm_offset_t vaddr; int error, pte_bits; KASSERT((spa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: spa not page-aligned")); KASSERT((epa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: epa not page-aligned")); if (spa < dmaplimit) { pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( dmaplimit, epa))); if (dmaplimit >= epa) return; spa = dmaplimit; } pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | X86_PG_V; error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); pte = vtopte(vaddr); for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); invlpg(vaddr); /* XXXKIB sfences inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); } vmem_free(kernel_arena, vaddr, PAGE_SIZE); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; + vm_paddr_t pa; vm_page_t m; + pa = 0; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) != 0 && ((pte & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0)) m = PHYS_TO_VM_PAGE(pte & PG_FRAME); } if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { pa = pmap_large_map_kextract(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { struct proc *p; struct thread *td; int i; PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); pmap->pm_pml4u = NULL; pmap->pm_cr3 = KPML4phys; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; pmap->pm_pcids[i].pm_gen = 1; } pmap_activate_boot(pmap); td = curthread; if (pti) { p = td->td_proc; PROC_LOCK(p); p->p_md.md_flags |= P_MD_KPTI; PROC_UNLOCK(p); } pmap_thread_init_invl_gen(td); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. */ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; } static void pmap_pinit_pml4_pti(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); for (i = 0; i < NPML4EPG; i++) pm_pml4[i] = pti_pml4[i]; } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg, pml4pgu; vm_paddr_t pml4phys; int i; /* * allocate the page directory page */ pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pml4phys = VM_PAGE_TO_PHYS(pml4pg); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_pml4u = NULL; pmap->pm_type = pm_type; if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. * Install minimal kernel mappings in PTI case. */ if (pm_type == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(pml4pgu)); pmap_pinit_pml4_pti(pml4pgu); pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, pkru_free_range, pmap, M_NOWAIT); } } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4, *pml4u; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that * we detect any programming errors that leave * the kernel-mode page table active on return * to user space. */ if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; pml4u = &pmap->pm_pml4u[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ for (i = 0; i < lm_ents; i++) /* Large Map */ pmap->pm_pml4[LMSPML4I + i] = 0; vm_page_unwire_noq(m); vm_page_free_zero(m); if (pmap->pm_pml4u != NULL) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); vm_page_unwire_noq(m); vm_page_free(m); } if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_fini(&pmap->pm_pkru); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * Allocate physical memory for the vm_page array and map it into KVA, * attempting to back the vm_pages with domain-local memory. */ void pmap_page_array_startup(long pages) { pdp_entry_t *pdpe; pd_entry_t *pde, newpdir; vm_offset_t va, start, end; vm_paddr_t pa; long pfn; int domain, i; vm_page_array_size = pages; start = va = VM_MIN_KERNEL_ADDRESS; end = va + pages * sizeof(struct vm_page); while (va < end) { pfn = first_page + (va - start) / sizeof(struct vm_page); domain = _vm_phys_domain(ptoa(pfn)); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) { pa = vm_phys_early_alloc(domain, PAGE_SIZE); dump_add_page(pa); pagezero((void *)PHYS_TO_DMAP(pa)); *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); } pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) != 0) panic("Unexpected pde"); pa = vm_phys_early_alloc(domain, NBPDR); for (i = 0; i < NPDEPG; i++) dump_add_page(pa + i * PAGE_SIZE); newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS | pg_g | pg_nx); pde_store(pde, newpdir); va += NBPDR; } vm_page_array = (vm_page_t)start; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (start_di) pmap_delayed_invl_finish(); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; bool start_di; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; /* * A delayed invalidation block should already be active if * pmap_advise() or pmap_remove() called this function by way * of pmap_demote_pde_locked(). */ start_di = pmap_not_in_di(); mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } else if (start_di) pmap_delayed_invl_start(); PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static void pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) { #ifdef INVARIANTS #ifdef DIAGNOSTIC pt_entry_t *xpte, *ypte; for (xpte = firstpte; xpte < firstpte + NPTEPG; xpte++, newpte += PAGE_SIZE) { if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { printf("pmap_demote_pde: xpte %zd and newpte map " "different pages: found %#lx, expected %#lx\n", xpte - firstpte, *xpte, newpte); printf("page table dump\n"); for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) printf("%zd %#lx\n", ypte - firstpte, *ypte); panic("firstpte"); } } #else KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); #endif #endif } static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t oldpde, struct rwlock **lockp) { struct spglist free; vm_offset_t sva; SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); if ((oldpde & pmap_global_bit(pmap)) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", va, pmap); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; int PG_PTE_CACHE; bool in_kernel; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); in_kernel = va >= VM_MAXUSER_ADDRESS; oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldpde & PG_A) == 0) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (mpte == NULL) { pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } if (!in_kernel) { mpte->wire_count = NPTEPG; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (in_kernel) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_G = pmap_global_bit(pmap); anyvalid = false; va = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } if ((*pte & PG_G) == 0) anyvalid = true; else if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { sva += PAGE_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_start(); PMAP_LOCK(pmap); pmap_pkru_on_remove(pmap, sva, eva); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) anyvalid = 1; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; /* * Although this function delays and batches the invalidation * of stale TLB entries, it does not need to call * pmap_delayed_invl_start() and * pmap_delayed_invl_finish(), because it does not * ordinarily destroy mappings. Stale TLB entries from * protection-only changes need only be invalidated before the * pmap lock is released, because protection-only changes do * not destroy PV entries. Even operations that iterate over * a physical page's PV list of mappings, like * pmap_remove_write(), acquire the pmap lock for each * mapping. Consequently, for protection-only changes, the * pmap lock suffices to synchronize both page table and TLB * updates. * * This function only destroys a mapping if pmap_demote_pde() * fails. In that case, stale TLB entries are immediately * invalidated. */ PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; pv = NULL; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) newpte |= pmap_pkru_get(pmap, va); /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If pkru is not same for the whole pde range, return failure * and let vm_fault() cope. Check after pde allocation, since * it could sleep. */ if (!pmap_pkru_same(pmap, va, va + NBPDR)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } return (KERN_FAILURE); } if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { newpde &= ~X86_PG_PKU_MASK; newpde |= pmap_pkru_get(pmap, va); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; oldpde = *pde; if ((oldpde & PG_V) != 0) { KASSERT(pdpg->wire_count > 1, ("pmap_enter_pde: pdpg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { pdpg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * The reference to the PD page that was acquired by * pmap_allocpde() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_start(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, lockp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finish(); } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pt_entry_t newpte, *pte, PG_V; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(addr)]; if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_start()/finish() calls around the * function are not needed. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. */ if (pmap_emulate_ad_bits(dst_pmap)) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); if (dst_pdpg == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else dst_pdpg->wire_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = *src_pte; /* * We only virtual copy managed pages. */ if ((ptetemp & PG_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_pde_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * paging-structure caches could * nonetheless have entries that refer * to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) { int error; if (dst_pmap->pm_type != src_pmap->pm_type || dst_pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (0); for (;;) { if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } error = pmap_pkru_copy(dst_pmap, src_pmap); /* Clean up partial copy on failure due to no memory. */ if (error == ENOMEM) pmap_pkru_deassign_all(dst_pmap); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; boolean_t mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pages[0] = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) pmap_unmap_io_transient(pages, vaddr, 2, FALSE); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_start() and * pmap_delayed_invl_finish(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) - if ((vm_page_aflags(mt) & PGA_WRITEABLE) != 0 && + if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - if ((vm_page_aflags(m) & PGA_WRITEABLE) != 0 && + if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; bool anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. */ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = false; pmap_delayed_invl_start(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldpde & PG_W) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); anychanged = true; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = true; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_M, PG_RW; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && pmap_demote_pde_locked(pmap, pde, va, &lock) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); atomic_clear_long(pte, PG_M | PG_RW); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && (ppim->mode == mode || (flags & MAPDEV_SETATTR) == 0)) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); if ((flags & MAPDEV_SETATTR) != 0) { PMAP_LOCK(kernel_pmap); i = pmap_change_attr_locked(va, size, mode, flags); PMAP_UNLOCK(kernel_pmap); } else i = 0; if (!i) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | MAPDEV_SETATTR)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, MAPDEV_SETATTR)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, MAPDEV_FLUSHCACHE)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pdpgpa; vm_page_t pdpg; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode, MAPDEV_FLUSHCACHE); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, int flags) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) return (EINVAL); if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte, X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode, flags); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V)) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) return (pti ? 0 : CR3_PCID_SAVE); if (pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), ("cpu %d pcid_next %#x", cpuid, pcid_next)); if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } static uint64_t pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) { uint64_t cached; cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); return (cached); } static void pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) { PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? PCPU_GET(pti_rsp0) : (uintptr_t)td->td_pcb; } static void inline pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) { struct invpcid_descr d; uint64_t cached, cr3, kcr3, ucr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); PCPU_SET(curpmap, pmap); kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | PMAP_PCID_USER_PT; if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Explicitly invalidate translations cached from the * user page table. They are not automatically * flushed by reload of cr3 with the kernel page table * pointer above. * * Note that the if() condition is resolved statically * by using the function argument instead of * runtime-evaluated invpcid_works value. */ if (invpcid_works1) { d.pcid = PMAP_PCID_USER_PT | pmap->pm_pcids[cpuid].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { pmap_pti_pcid_invalidate(ucr3, kcr3); } } PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { pmap_activate_sw_pcid_pti(pmap, cpuid, true); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { register_t rflags; /* * If the INVPCID instruction is not available, * invltlb_pcid_handler() is used to handle an invalidate_all * IPI, which checks for curpmap == smp_tlb_pmap. The below * sequence of operations has a window where %CR3 is loaded * with the new pmap's PML4 address, but the curpmap value has * not yet been updated. This causes the invltlb IPI handler, * which is called between the updates, to execute as a NOP, * which leaves stale TLB entries. * * Note that the most typical use of pmap_activate_sw(), from * the context switch, is immune to this race, because * interrupts are disabled (while the thread lock is owned), * and the IPI happens after curpmap is updated. Protect * other callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap assignment. */ rflags = intr_disable(); pmap_activate_sw_pcid_pti(pmap, cpuid, false); intr_restore(rflags); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { uint64_t cached, cr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); PCPU_SET(curpmap, pmap); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { register_t rflags; rflags = intr_disable(); pmap_activate_sw_pcid_nopti(td, pmap, cpuid); intr_restore(rflags); } static void pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid __unused) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } static void pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, u_int cpuid __unused) { pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); PCPU_SET(kcr3, pmap->pm_cr3); PCPU_SET(ucr3, pmap->pm_ucr3); pmap_activate_sw_pti_post(td, pmap); } DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, u_int)) { if (pmap_pcid_enabled && pti && invpcid_works) return (pmap_activate_sw_pcid_invpcid_pti); else if (pmap_pcid_enabled && pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_pti); else if (pmap_pcid_enabled && !pti && invpcid_works) return (pmap_activate_sw_pcid_nopti); else if (pmap_pcid_enabled && !pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_nopti); else if (!pmap_pcid_enabled && pti) return (pmap_activate_sw_nopcid_pti); else /* if (!pmap_pcid_enabled && !pti) */ return (pmap_activate_sw_nopcid_nopti); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) return; cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif pmap_activate_sw_mode(td, pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { uint64_t kcr3; u_int cpuid; /* * kernel_pmap must be never deactivated, and we ensure that * by never activating it at all. */ MPASS(pmap != kernel_pmap); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); if (pti) { kcr3 = pmap->pm_cr3; if (pmap_pcid_enabled) kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; } else { kcr3 = PMAP_NO_CR3; } PCPU_SET(kcr3, kcr3); PCPU_SET(ucr3, PMAP_NO_CR3); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. */ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } #if VM_NRESERVLEVEL > 0 /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #endif #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error __unused, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. */ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } /* * Pdp pages from the large map are managed differently from either * kernel or user page table pages. They are permanently allocated at * initialization time, and their wire count is permanently set to * zero. The pml4 entries pointing to those pages are copied into * each allocated pmap. * * In contrast, pd and pt pages are managed like user page table * pages. They are dynamically allocated, and their wire count * represents the number of valid entries within the page. */ static vm_page_t pmap_large_map_getptp_unlocked(void) { vm_page_t m; m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); if (m != NULL && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } static vm_page_t pmap_large_map_getptp(void) { vm_page_t m; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); m = pmap_large_map_getptp_unlocked(); if (m == NULL) { PMAP_UNLOCK(kernel_pmap); vm_wait(NULL); PMAP_LOCK(kernel_pmap); /* Callers retry. */ } return (m); } static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { vm_pindex_t pml4_idx; vm_paddr_t mphys; pml4_idx = pmap_pml4e_index(va); KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } static pd_entry_t * pmap_large_map_pde(vm_offset_t va) { pdp_entry_t *pdpe; vm_page_t m; vm_paddr_t mphys; retry: pdpe = pmap_large_map_pdpe(va); if (*pdpe == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & PG_FRAME; } return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); } static pt_entry_t * pmap_large_map_pte(vm_offset_t va) { pd_entry_t *pde; vm_page_t m; vm_paddr_t mphys; retry: pde = pmap_large_map_pde(va); if (*pde == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & PG_FRAME; } return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); } static vm_paddr_t pmap_large_map_kextract(vm_offset_t va) { pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte, pt; KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), ("not largemap range %#lx", (u_long)va)); pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) return ((pd & PG_PS_FRAME) | (va & PDRMASK)); pte = pmap_pde_to_pte(pde, va); pt = *pte; KASSERT((pt & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); return ((pt & PG_FRAME) | (va & PAGE_MASK)); } static int pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, vmem_addr_t *vmem_res) { /* * Large mappings are all but static. Consequently, there * is no point in waiting for an earlier allocation to be * freed. */ return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); } int pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, vm_memattr_t mattr) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va, inc; vmem_addr_t vmem_res; vm_paddr_t pa; int error; if (len == 0 || spa + len < spa) return (EINVAL); /* See if DMAP can serve. */ if (spa + len <= dmaplimit) { va = PHYS_TO_DMAP(spa); *addr = (void *)va; return (pmap_change_attr(va, len, mattr)); } /* * No, allocate KVA. Fit the address with best possible * alignment for superpages. Fall back to worse align if * failed. */ error = ENOMEM; if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, NBPDP) >= roundup2(spa, NBPDP) + NBPDP) error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, &vmem_res); if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, NBPDR) + NBPDR) error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, &vmem_res); if (error != 0) error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); if (error != 0) return (error); /* * Fill pagetable. PG_M is not pre-set, we scan modified bits * in the pagetable to minimize flushing. No need to * invalidate TLB, since we only update invalid entries. */ PMAP_LOCK(kernel_pmap); for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, len -= inc) { if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { pdpe = pmap_large_map_pdpe(va); MPASS(*pdpe == 0); *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); inc = NBPDP; } else if (len >= NBPDR && (pa & PDRMASK) == 0 && (va & PDRMASK) == 0) { pde = pmap_large_map_pde(va); MPASS(*pde == 0); *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> wire_count++; inc = NBPDR; } else { pte = pmap_large_map_pte(va); MPASS(*pte == 0); *pte = pa | pg_g | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, FALSE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> wire_count++; inc = PAGE_SIZE; } } PMAP_UNLOCK(kernel_pmap); MPASS(len == 0); *addr = (void *)vmem_res; return (0); } void pmap_large_unmap(void *svaa, vm_size_t len) { vm_offset_t sva, va; vm_size_t inc; pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte; vm_page_t m; struct spglist spgf; sva = (vm_offset_t)svaa; if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) return; SLIST_INIT(&spgf); KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); PMAP_LOCK(kernel_pmap); for (va = sva; va < sva + len; va += inc) { pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT((va & PDPMASK) == 0, ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT(va + NBPDP <= sva + len, ("unmap covers partial 1GB page, sva %#lx va %#lx " "pdpe %#lx pdp %#lx len %#lx", sva, va, (u_long)pdpe, pdp, len)); *pdpe = 0; inc = NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) { KASSERT((va & PDRMASK) == 0, ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); KASSERT(va + NBPDR <= sva + len, ("unmap covers partial 2MB page, sva %#lx va %#lx " "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, pd, len)); pde_store(pde, 0); inc = NBPDR; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } continue; } pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); inc = PAGE_SIZE; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); m->wire_count--; if (m->wire_count == 0) { *pde = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } } } pmap_invalidate_range(kernel_pmap, sva, sva + len); PMAP_UNLOCK(kernel_pmap); vm_page_free_pages_toq(&spgf, false); vmem_free(large_vmem, sva, len); } static void pmap_large_map_wb_fence_mfence(void) { mfence(); } static void pmap_large_map_wb_fence_sfence(void) { sfence(); } static void pmap_large_map_wb_fence_nop(void) { } DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) { if (cpu_vendor_id != CPU_VENDOR_INTEL) return (pmap_large_map_wb_fence_mfence); else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | CPUID_STDEXT_CLFLUSHOPT)) == 0) return (pmap_large_map_wb_fence_sfence); else /* clflush is strongly enough ordered */ return (pmap_large_map_wb_fence_nop); } static void pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clwb(va); } static void pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflushopt(va); } static void pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflush(va); } static void pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) { } DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) { if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) return (pmap_large_map_flush_range_clwb); else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) return (pmap_large_map_flush_range_clflushopt); else if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_large_map_flush_range_clflush); else return (pmap_large_map_flush_range_nop); } static void pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) { volatile u_long *pe; u_long p; vm_offset_t va; vm_size_t inc; bool seen_other; for (va = sva; va < eva; va += inc) { inc = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { pe = (volatile u_long *)pmap_large_map_pdpe(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDP; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pde(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDR; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pte(va); p = *pe; inc = PAGE_SIZE; } seen_other = false; for (;;) { if ((p & X86_PG_AVAIL1) != 0) { /* * Spin-wait for the end of a parallel * write-back. */ cpu_spinwait(); p = *pe; /* * If we saw other write-back * occuring, we cannot rely on PG_M to * indicate state of the cache. The * PG_M bit is cleared before the * flush to avoid ignoring new writes, * and writes which are relevant for * us might happen after. */ seen_other = true; continue; } if ((p & X86_PG_M) != 0 || seen_other) { if (!atomic_fcmpset_long(pe, &p, (p & ~X86_PG_M) | X86_PG_AVAIL1)) /* * If we saw PG_M without * PG_AVAIL1, and then on the * next attempt we do not * observe either PG_M or * PG_AVAIL1, the other * write-back started after us * and finished before us. We * can rely on it doing our * work. */ continue; pmap_large_map_flush_range(va, inc); atomic_clear_long(pe, X86_PG_AVAIL1); } break; } maybe_yield(); } } /* * Write-back cache lines for the given address range. * * Must be called only on the range or sub-range returned from * pmap_large_map(). Must not be called on the coalesced ranges. * * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH * instructions support. */ void pmap_large_map_wb(void *svap, vm_size_t len) { vm_offset_t eva, sva; sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { pmap_large_map_flush_range(sva, len); } else { KASSERT(sva >= LARGEMAP_MIN_ADDRESS && eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } pmap_large_map_wb_fence(); } static vm_page_t pmap_pti_alloc_page(void) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); return (m); } static bool pmap_pti_free_page(vm_page_t m) { KASSERT(m->wire_count > 0, ("page %p not wired", m)); if (!vm_page_unwire_noq(m)) return (false); vm_page_free_zero(m); return (true); } static void pmap_pti_init(void) { vm_page_t pml4_pg; pdp_entry_t *pdpe; vm_offset_t va; int i; if (!pti) return; pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + sizeof(struct gate_descriptor) * NIDT, false); pmap_pti_add_kva_locked((vm_offset_t)common_tss, (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); CPU_FOREACH(i) { /* Doublefault stack IST 1 */ va = common_tss[i].tss_ist1; pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* NMI stack IST 2 */ va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* MC# stack IST 3 */ va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* DB# stack IST 4 */ va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, (vm_offset_t)etext, true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); } SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); static pdp_entry_t * pmap_pti_pdpe(vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; vm_page_t m; vm_pindex_t pml4_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pml4_idx = pmap_pml4e_index(va); pml4e = &pti_pml4[pml4_idx]; m = NULL; if (*pml4e == 0) { if (pti_finalized) panic("pml4 alloc after finalization\n"); m = pmap_pti_alloc_page(); if (*pml4e != 0) { pmap_pti_free_page(m); mphys = *pml4e & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pml4e = mphys | X86_PG_RW | X86_PG_V; } } else { mphys = *pml4e & ~PAGE_MASK; } pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); return (pdpe); } static void pmap_pti_wire_pte(void *pte) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); m->wire_count++; } static void pmap_pti_unwire_pde(void *pde, bool only_ref) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); MPASS(m->wire_count > 0); MPASS(only_ref || m->wire_count > 1); pmap_pti_free_page(m); } static void pmap_pti_unwire_pte(void *pte, vm_offset_t va) { vm_page_t m; pd_entry_t *pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); MPASS(m->wire_count > 0); if (pmap_pti_free_page(m)) { pde = pmap_pti_pde(va); MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); *pde = 0; pmap_pti_unwire_pde(pde, false); } } static pd_entry_t * pmap_pti_pde(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_page_t m; vm_pindex_t pd_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pdpe = pmap_pti_pdpe(va); if (*pdpe == 0) { m = pmap_pti_alloc_page(); if (*pdpe != 0) { pmap_pti_free_page(m); MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_RW | X86_PG_V; } } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); pd_idx = pmap_pde_index(va); pde += pd_idx; return (pde); } static pt_entry_t * pmap_pti_pte(vm_offset_t va, bool *unwire_pde) { pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pde = pmap_pti_pde(va); if (unwire_pde != NULL) { *unwire_pde = true; pmap_pti_wire_pte(pde); } if (*pde == 0) { m = pmap_pti_alloc_page(); if (*pde != 0) { pmap_pti_free_page(m); MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } else { mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_RW | X86_PG_V; if (unwire_pde != NULL) *unwire_pde = false; } } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); pte += pmap_pte_index(va); return (pte); } static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte, ptev; bool unwire_pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); sva = trunc_page(sva); MPASS(sva > VM_MAXUSER_ADDRESS); eva = round_page(eva); MPASS(sva < eva); for (; sva < eva; sva += PAGE_SIZE) { pte = pmap_pti_pte(sva, &unwire_pde); pa = pmap_kextract(sva); ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); if (*pte == 0) { pte_store(pte, ptev); pmap_pti_wire_pte(pte); } else { KASSERT(!pti_finalized, ("pti overlap after fin %#lx %#lx %#lx", sva, *pte, ptev)); KASSERT(*pte == ptev, ("pti non-identical pte after fin %#lx %#lx %#lx", sva, *pte, ptev)); } if (unwire_pde) { pde = pmap_pti_pde(sva); pmap_pti_unwire_pde(pde, true); } } } void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) { if (!pti) return; VM_OBJECT_WLOCK(pti_obj); pmap_pti_add_kva_locked(sva, eva, exec); VM_OBJECT_WUNLOCK(pti_obj); } void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) { pt_entry_t *pte; vm_offset_t va; if (!pti) return; sva = rounddown2(sva, PAGE_SIZE); MPASS(sva > VM_MAXUSER_ADDRESS); eva = roundup2(eva, PAGE_SIZE); MPASS(sva < eva); VM_OBJECT_WLOCK(pti_obj); for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_pti_pte(va, NULL); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); pmap_pti_unwire_pte(pte, va); } pmap_invalidate_range(kernel_pmap, sva, eva); VM_OBJECT_WUNLOCK(pti_obj); } static void * pkru_dup_range(void *ctx __unused, void *data) { struct pmap_pkru_range *node, *new_node; new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (new_node == NULL) return (NULL); node = data; memcpy(new_node, node, sizeof(*node)); return (new_node); } static void pkru_free_range(void *ctx __unused, void *node) { uma_zfree(pmap_pkru_ranges_zone, node); } static int pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { struct pmap_pkru_range *ppr; int error; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if ((flags & AMD64_PKRU_EXCL) != 0 && !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) return (EBUSY); ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (ppr == NULL) return (ENOMEM); ppr->pkru_keyidx = keyidx; ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); if (error != 0) uma_zfree(pmap_pkru_ranges_zone, ppr); return (error); } static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); return (rangeset_remove(&pmap->pm_pkru, sva, eva)); } static void pmap_pkru_deassign_all(pmap_t pmap) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_remove_all(&pmap->pm_pkru); } static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_pkru_range *ppr, *prev_ppr; vm_offset_t va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || sva >= VM_MAXUSER_ADDRESS) return (true); MPASS(eva <= VM_MAXUSER_ADDRESS); for (va = sva, prev_ppr = NULL; va < eva;) { ppr = rangeset_lookup(&pmap->pm_pkru, va); if ((ppr == NULL) ^ (prev_ppr == NULL)) return (false); if (ppr == NULL) { va += PAGE_SIZE; continue; } if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) return (false); va = ppr->pkru_rs_el.re_end; } return (true); } static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va) { struct pmap_pkru_range *ppr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || va >= VM_MAXUSER_ADDRESS) return (0); ppr = rangeset_lookup(&pmap->pm_pkru, va); if (ppr != NULL) return (X86_PG_PKU(ppr->pkru_keyidx)); return (0); } static bool pred_pkru_on_remove(void *ctx __unused, void *r) { struct pmap_pkru_range *ppr; ppr = r; return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); } static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_remove_pred(&pmap->pm_pkru, sva, eva, pred_pkru_on_remove); } } static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) { PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); MPASS(dst_pmap->pm_type == PT_X86); MPASS(src_pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if (src_pmap->pm_pkru.rs_data_ctx == NULL) return (0); return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); } static void pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t newpde, ptpaddr, *pde; pt_entry_t newpte, *ptep, pte; vm_offset_t va, va_next; bool changed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS(keyidx <= PMAP_MAX_PKRU_IDX); for (changed = false, va = sva; va < eva; va = va_next) { pml4e = pmap_pml4e(pmap, va); if ((*pml4e & X86_PG_V) == 0) { va_next = (va + NBPML4) & ~PML4MASK; if (va_next < va) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, va); if ((*pdpe & X86_PG_V) == 0) { va_next = (va + NBPDP) & ~PDPMASK; if (va_next < va) va_next = eva; continue; } va_next = (va + NBPDR) & ~PDRMASK; if (va_next < va) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, va); ptpaddr = *pde; if (ptpaddr == 0) continue; MPASS((ptpaddr & X86_PG_V) != 0); if ((ptpaddr & PG_PS) != 0) { if (va + NBPDR == va_next && eva >= va_next) { newpde = (ptpaddr & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpde != ptpaddr) { *pde = newpde; changed = true; } continue; } else if (!pmap_demote_pde(pmap, pde, va)) { continue; } } if (va_next > eva) va_next = eva; for (ptep = pmap_pde_to_pte(pde, va); va != va_next; ptep++, va += PAGE_SIZE) { pte = *ptep; if ((pte & X86_PG_V) == 0) continue; newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpte != pte) { *ptep = newpte; changed = true; } } } if (changed) pmap_invalidate_range(pmap, sva, eva); } static int pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) return (EINVAL); if (eva <= sva || eva > VM_MAXUSER_ADDRESS) return (EFAULT); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (ENOTSUP); return (0); } int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, keyidx); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_deassign(pmap, sva, eva); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, 0); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int ptes; int pdes; int pdpes; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int i, pat_idx; if (eva <= range->sva) return; pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); for (i = 0; i < PAT_INDEX_SIZE; i++) if (pat_index[i] == pat_idx) break; switch (i) { case PAT_WRITE_BACK: mode = "WB"; break; case PAT_WRITE_THROUGH: mode = "WT"; break; case PAT_UNCACHEABLE: mode = "UC"; break; case PAT_WRITE_PROTECTED: mode = "WP"; break; case PAT_WRITE_COMBINING: mode = "WC"; break; default: printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n", __func__, i, range->sva, eva); mode = "??"; break; } sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n", range->sva, eva, (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', (range->attrs & pg_nx) != 0 ? '-' : 'x', (range->attrs & X86_PG_U) != 0 ? 'u' : 's', (range->attrs & X86_PG_G) != 0 ? 'g' : '-', mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. This is not quite as simple as a direct * flag comparison since some PAT modes have multiple representations. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { pt_entry_t diff, mask; mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; diff = (range->attrs ^ attrs) & mask; if (diff == 0) return (true); if ((diff & ~X86_PG_PDE_PAT) == 0 && pmap_pat_index(kernel_pmap, range->attrs, true) == pmap_pat_index(kernel_pmap, attrs, true)) return (true); return (false); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, pt_entry_t pte) { pt_entry_t attrs; attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); attrs |= pdpe & pg_nx; attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); if ((pdpe & PG_PS) != 0) { attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pde != 0) { attrs |= pde & pg_nx; attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); } if ((pde & PG_PS) != 0) { attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pte != 0) { attrs |= pte & pg_nx; attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); /* Canonicalize by always using the PDE PAT bit. */ if ((attrs & X86_PG_PTE_PAT) != 0) attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; } if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pml4_entry_t pml4e; pdp_entry_t *pdp, pdpe; pd_entry_t *pd, pde; pt_entry_t *pt, pte; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k, l; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Outside of the large map, kernel page table pages are never * freed, so at worst we will observe inconsistencies in the output. * Within the large map, ensure that PDP and PD page addresses are * valid before descending. */ for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { switch (i) { case PML4PML4I: sbuf_printf(sb, "\nRecursive map:\n"); break; case DMPML4I: sbuf_printf(sb, "\nDirect map:\n"); break; case KPML4BASE: sbuf_printf(sb, "\nKernel map:\n"); break; case LMSPML4I: sbuf_printf(sb, "\nLarge map:\n"); break; } /* Convert to canonical form. */ if (sva == 1ul << 47) sva |= -1ul << 48; restart: pml4e = kernel_pmap->pm_pml4[i]; if ((pml4e & X86_PG_V) == 0) { sva = rounddown2(sva, NBPML4); sysctl_kmaps_dump(sb, &range, sva); sva += NBPML4; continue; } pa = pml4e & PG_FRAME; pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { pdpe = pdp[j]; if ((pdpe & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDP; continue; } pa = pdpe & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pdpe & PG_PS) != 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 0, 0); range.pdpes++; sva += NBPDP; continue; } pd = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_pde_index(sva); k < NPDEPG; k++) { pde = pd[k]; if ((pde & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDR; continue; } pa = pde & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pde & PG_PS) != 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, 0); range.pdes++; sva += NBPDR; continue; } pt = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = pmap_pte_index(sva); l < NPTEPG; l++, sva += PAGE_SIZE) { pte = pt[l]; if ((pte & X86_PG_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, pte); range.ptes++; } } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); #ifdef DDB DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); db_printf("VA %#016lx pml4e %#016lx", va, *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); db_printf(" pdpe %#016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); db_printf(" pde %#016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); db_printf(" pte %#016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } #endif Index: head/sys/amd64/include/pmap.h =================================================================== --- head/sys/amd64/include/pmap.h (revision 352406) +++ head/sys/amd64/include/pmap.h (revision 352407) @@ -1,513 +1,512 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 * $FreeBSD$ */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ /* * Page-directory and page-table entries follow this format, with a few * of the fields not present here and there, depending on a lot of things. */ /* ---- Intel Nomenclature ---- */ #define X86_PG_V 0x001 /* P Valid */ #define X86_PG_RW 0x002 /* R/W Read/Write */ #define X86_PG_U 0x004 /* U/S User/Supervisor */ #define X86_PG_NC_PWT 0x008 /* PWT Write through */ #define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ #define X86_PG_A 0x020 /* A Accessed */ #define X86_PG_M 0x040 /* D Dirty */ #define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ #define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ #define X86_PG_G 0x100 /* G Global */ #define X86_PG_AVAIL1 0x200 /* / Available for system */ #define X86_PG_AVAIL2 0x400 /* < programmers use */ #define X86_PG_AVAIL3 0x800 /* \ */ #define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ #define X86_PG_PKU(idx) ((pt_entry_t)idx << 59) #define X86_PG_NX (1ul<<63) /* No-execute */ #define X86_PG_AVAIL(x) (1ul << (x)) /* Page level cache control fields used to determine the PAT type */ #define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) #define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) /* Protection keys indexes */ #define PMAP_MAX_PKRU_IDX 0xf #define X86_PG_PKU_MASK X86_PG_PKU(PMAP_MAX_PKRU_IDX) /* * Intel extended page table (EPT) bit definitions. */ #define EPT_PG_READ 0x001 /* R Read */ #define EPT_PG_WRITE 0x002 /* W Write */ #define EPT_PG_EXECUTE 0x004 /* X Execute */ #define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ #define EPT_PG_PS 0x080 /* PS Page size */ #define EPT_PG_A 0x100 /* A Accessed */ #define EPT_PG_M 0x200 /* D Dirty */ #define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ /* * Define the PG_xx macros in terms of the bits on x86 PTEs. */ #define PG_V X86_PG_V #define PG_RW X86_PG_RW #define PG_U X86_PG_U #define PG_NC_PWT X86_PG_NC_PWT #define PG_NC_PCD X86_PG_NC_PCD #define PG_A X86_PG_A #define PG_M X86_PG_M #define PG_PS X86_PG_PS #define PG_PTE_PAT X86_PG_PTE_PAT #define PG_G X86_PG_G #define PG_AVAIL1 X86_PG_AVAIL1 #define PG_AVAIL2 X86_PG_AVAIL2 #define PG_AVAIL3 X86_PG_AVAIL3 #define PG_PDE_PAT X86_PG_PDE_PAT #define PG_NX X86_PG_NX #define PG_PDE_CACHE X86_PG_PDE_CACHE #define PG_PTE_CACHE X86_PG_PTE_CACHE /* Our various interpretations of the above */ #define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ #define PG_MANAGED X86_PG_AVAIL2 #define EPT_PG_EMUL_V X86_PG_AVAIL(52) #define EPT_PG_EMUL_RW X86_PG_AVAIL(53) #define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ #define PG_FRAME (0x000ffffffffff000ul) #define PG_PS_FRAME (0x000fffffffe00000ul) #define PG_PS_PDP_FRAME (0x000fffffc0000000ul) /* * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB * (PTE) page mappings have identical settings for the following fields: */ #define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ PG_M | PG_A | PG_U | PG_RW | PG_V | PG_PKU_MASK) /* * Page Protection Exception bits */ #define PGEX_P 0x01 /* Protection violation vs. not present */ #define PGEX_W 0x02 /* during a Write cycle */ #define PGEX_U 0x04 /* access from User mode (UPL) */ #define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ #define PGEX_I 0x10 /* during an instruction fetch */ #define PGEX_PK 0x20 /* protection key violation */ #define PGEX_SGX 0x8000 /* SGX-related */ /* * undef the PG_xx macros that define bits in the regular x86 PTEs that * have a different position in nested PTEs. This is done when compiling * code that needs to be aware of the differences between regular x86 and * nested PTEs. * * The appropriate bitmask will be calculated at runtime based on the pmap * type. */ #ifdef AMD64_NPT_AWARE #undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ #undef PG_G #undef PG_A #undef PG_M #undef PG_PDE_PAT #undef PG_PDE_CACHE #undef PG_PTE_PAT #undef PG_PTE_CACHE #undef PG_RW #undef PG_V #endif /* * Pte related macros. This is complicated by having to deal with * the sign extension of the 48th bit. */ #define KVADDR(l4, l3, l2, l1) ( \ ((unsigned long)-1 << 47) | \ ((unsigned long)(l4) << PML4SHIFT) | \ ((unsigned long)(l3) << PDPSHIFT) | \ ((unsigned long)(l2) << PDRSHIFT) | \ ((unsigned long)(l1) << PAGE_SHIFT)) #define UVADDR(l4, l3, l2, l1) ( \ ((unsigned long)(l4) << PML4SHIFT) | \ ((unsigned long)(l3) << PDPSHIFT) | \ ((unsigned long)(l2) << PDRSHIFT) | \ ((unsigned long)(l1) << PAGE_SHIFT)) /* * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, * but setting it larger than NDMPML4E makes no sense. * * Each slot provides .5 TB of kernel virtual space. */ #define NKPML4E 4 #define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ #define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ #define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ /* * NDMPML4E is the maximum number of PML4 entries that will be * used to implement the direct map. It must be a power of two, * and should generally exceed NKPML4E. The maximum possible * value is 64; using 128 will make the direct map intrude into * the recursive page table map. */ #define NDMPML4E 8 /* * NPAPML4E is the maximum number of PML4 entries that will be * used to implement the page array. This should be roughly 3% of * NPDPML4E owing to 3% overhead for struct vm_page. */ #define NPAPML4E 1 /* * These values control the layout of virtual memory. The starting address * of the direct map, which is controlled by DMPML4I, must be a multiple of * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) * * Note: KPML4I is the index of the (single) level 4 page that maps * the KVA that holds KERNBASE, while KPML4BASE is the index of the * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to * KERNBASE. * * (KPML4I combines with KPDPI to choose where KERNBASE starts. * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, * and KPDPI provides bits 30..38.) */ #define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ #define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ #define PAPML4I (KPML4BASE-1-NPAPML4E) /* Below KVM */ #define DMPML4I rounddown(PAPML4I-NDMPML4E, NDMPML4E) /* Below pages */ #define KPML4I (NPML4EPG-1) #define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ /* Large map: index of the first and max last pml4 entry */ #define LMSPML4I (PML4PML4I + 1) #define LMEPML4I (DMPML4I - 1) /* * XXX doesn't really belong here I guess... */ #define ISA_HOLE_START 0xa0000 #define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) #define PMAP_PCID_NONE 0xffffffff #define PMAP_PCID_KERN 0 #define PMAP_PCID_OVERMAX 0x1000 #define PMAP_PCID_OVERMAX_KERN 0x800 #define PMAP_PCID_USER_PT 0x800 #define PMAP_NO_CR3 (~0UL) #ifndef LOCORE #include #include #include #include #include #include #include typedef u_int64_t pd_entry_t; typedef u_int64_t pt_entry_t; typedef u_int64_t pdp_entry_t; typedef u_int64_t pml4_entry_t; /* * Address of current address space page table maps and directories. */ #ifdef _KERNEL #define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) #define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) #define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) #define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) #define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) #define PTmap ((pt_entry_t *)(addr_PTmap)) #define PDmap ((pd_entry_t *)(addr_PDmap)) #define PDPmap ((pd_entry_t *)(addr_PDPmap)) #define PML4map ((pd_entry_t *)(addr_PML4map)) #define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) extern int nkpt; /* Initial number of kernel page tables */ extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ /* * virtual address to page table entry and * to physical address. * Note: these work recursively, thus vtopte of a pte will give * the corresponding pde that in turn maps it. */ pt_entry_t *vtopte(vm_offset_t); #define vtophys(va) pmap_kextract(((vm_offset_t) (va))) #define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) #define pte_load_clear(ptep) atomic_swap_long(ptep, 0) #define pte_store(ptep, pte) do { \ *(u_long *)(ptep) = (u_long)(pte); \ } while (0) #define pte_clear(ptep) pte_store(ptep, 0) #define pde_store(pdep, pde) pte_store(pdep, pde) extern pt_entry_t pg_nx; #endif /* _KERNEL */ /* * Pmap stuff */ struct pv_entry; struct pv_chunk; /* * Locks * (p) PV list lock */ struct md_page { TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ int pv_gen; /* (p) */ int pat_mode; }; enum pmap_type { PT_X86, /* regular x86 page tables */ PT_EPT, /* Intel's nested page tables */ PT_RVI, /* AMD's nested page tables */ }; struct pmap_pcids { uint32_t pm_pcid; uint32_t pm_gen; }; /* * The kernel virtual address (KVA) of the level 4 page table page is always * within the direct map (DMAP) region. */ struct pmap { struct mtx pm_mtx; pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */ uint64_t pm_cr3; uint64_t pm_ucr3; TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ enum pmap_type pm_type; /* regular or nested tables */ struct pmap_statistics pm_stats; /* pmap statistics */ struct vm_radix pm_root; /* spare page table pages */ long pm_eptgen; /* EPT pmap generation id */ int pm_flags; struct pmap_pcids pm_pcids[MAXCPU]; struct rangeset pm_pkru; }; /* flags */ #define PMAP_NESTED_IPIMASK 0xff #define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ #define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ #define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ typedef struct pmap *pmap_t; #ifdef _KERNEL extern struct pmap kernel_pmap_store; #define kernel_pmap (&kernel_pmap_store) #define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) #define PMAP_LOCK_ASSERT(pmap, type) \ mtx_assert(&(pmap)->pm_mtx, (type)) #define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) #define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ NULL, MTX_DEF | MTX_DUPOK) #define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) #define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); #endif /* * For each vm_page_t, there is a list of all currently valid virtual * mappings of that page. An entry is a pv_entry_t, the list is pv_list. */ typedef struct pv_entry { vm_offset_t pv_va; /* virtual address for mapping */ TAILQ_ENTRY(pv_entry) pv_next; } *pv_entry_t; /* * pv_entries are allocated in chunks per-process. This avoids the * need to track per-pmap assignments. */ #define _NPCM 3 #define _NPCPV 168 #define PV_CHUNK_HEADER \ pmap_t pc_pmap; \ TAILQ_ENTRY(pv_chunk) pc_list; \ uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ \ TAILQ_ENTRY(pv_chunk) pc_lru; struct pv_chunk_header { PV_CHUNK_HEADER }; struct pv_chunk { PV_CHUNK_HEADER struct pv_entry pc_pventry[_NPCPV]; }; #ifdef _KERNEL extern caddr_t CADDR1; extern pt_entry_t *CMAP1; extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; extern vm_paddr_t dmaplimit; extern int pmap_pcid_enabled; extern int invpcid_works; #define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) -#define pmap_page_is_write_mapped(m) \ - (((m)->astate.flags & PGA_WRITEABLE) != 0) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) struct thread; void pmap_activate_boot(pmap_t pmap); void pmap_activate_sw(struct thread *); void pmap_bootstrap(vm_paddr_t *); int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); int pmap_change_attr(vm_offset_t, vm_size_t, int); void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); void pmap_flush_cache_range(vm_offset_t, vm_offset_t); void pmap_flush_cache_phys_range(vm_paddr_t, vm_paddr_t, vm_memattr_t); void pmap_init_pat(void); void pmap_kenter(vm_offset_t va, vm_paddr_t pa); void *pmap_kenter_temporary(vm_paddr_t pa, int i); vm_paddr_t pmap_kextract(vm_offset_t); void pmap_kremove(vm_offset_t); int pmap_large_map(vm_paddr_t, vm_size_t, void **, vm_memattr_t); void pmap_large_map_wb(void *sva, vm_size_t len); void pmap_large_unmap(void *sva, vm_size_t len); void *pmap_mapbios(vm_paddr_t, vm_size_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); void *pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size); bool pmap_not_in_di(void); boolean_t pmap_page_is_mapped(vm_page_t m); void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); void pmap_pinit_pml4(vm_page_t); bool pmap_ps_enabled(pmap_t pmap); void pmap_unmapdev(vm_offset_t, vm_size_t); void pmap_invalidate_page(pmap_t, vm_offset_t); void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); void pmap_invalidate_all(pmap_t); void pmap_invalidate_cache(void); void pmap_invalidate_cache_pages(vm_page_t *pages, int count); void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva); void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec); void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva); void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3); void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va); void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva, vm_offset_t eva); int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags); void pmap_thread_init_invl_gen(struct thread *td); int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap); void pmap_page_array_startup(long count); #endif /* _KERNEL */ /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pde_index(vm_offset_t va) { return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pdpe_index(vm_offset_t va) { return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pml4e_index(vm_offset_t va) { return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/arm/arm/pmap-v4.c =================================================================== --- head/sys/arm/arm/pmap-v4.c (revision 352406) +++ head/sys/arm/arm/pmap-v4.c (revision 352407) @@ -1,4474 +1,4474 @@ /* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */ /*- * Copyright 2004 Olivier Houchard. * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2002-2003 Wasabi Systems, Inc. * Copyright (c) 2001 Richard Earnshaw * Copyright (c) 2001-2002 Christopher Gilbert * All rights reserved. * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Mark Brinicombe. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * * RiscBSD kernel project * * pmap.c * * Machine dependent vm stuff * * Created : 20/09/94 */ /* * Special compilation symbols * PMAP_DEBUG - Build in pmap_debug_level code * * Note that pmap_mapdev() and pmap_unmapdev() are implemented in arm/devmap.c */ /* Include header files */ #include "opt_vm.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PMAP_DEBUG #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 0; #define PMAP_INLINE #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #define PMAP_INLINE __inline #endif /* PMAP_DEBUG */ extern struct pv_addr systempage; extern int last_fault_code; #define l1pte_section_p(pde) (((pde) & L1_TYPE_MASK) == L1_TYPE_S) #define l2pte_index(v) (((v) & L1_S_OFFSET) >> L2_S_SHIFT) #define l2pte_valid(pte) ((pte) != 0) #define l2pte_pa(pte) ((pte) & L2_S_FRAME) /* * Internal function prototypes */ static void pmap_free_pv_entry (pv_entry_t); static pv_entry_t pmap_get_pv_entry(void); static int pmap_enter_locked(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int); static vm_paddr_t pmap_extract_locked(pmap_t pmap, vm_offset_t va); static void pmap_fix_cache(struct vm_page *, pmap_t, vm_offset_t); static void pmap_alloc_l1(pmap_t); static void pmap_free_l1(pmap_t); static int pmap_clearbit(struct vm_page *, u_int); static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t); static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t); static void pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int); static vm_offset_t kernel_pt_lookup(vm_paddr_t); static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1"); vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t pmap_curmaxkvaddr; vm_paddr_t kernel_l1pa; vm_offset_t kernel_vm_end = 0; vm_offset_t vm_max_kernel_address; struct pmap kernel_pmap_store; static pt_entry_t *csrc_pte, *cdst_pte; static vm_offset_t csrcp, cdstp, qmap_addr; static struct mtx cmtx, qmap_mtx; static void pmap_init_l1(struct l1_ttable *, pd_entry_t *); /* * These routines are called when the CPU type is identified to set up * the PTE prototypes, cache modes, etc. * * The variables are always here, just in case LKMs need to reference * them (though, they shouldn't). */ static pt_entry_t pte_l1_s_cache_mode; static pt_entry_t pte_l1_s_cache_mode_pt; static pt_entry_t pte_l1_s_cache_mask; static pt_entry_t pte_l2_l_cache_mode; static pt_entry_t pte_l2_l_cache_mode_pt; static pt_entry_t pte_l2_l_cache_mask; static pt_entry_t pte_l2_s_cache_mode; static pt_entry_t pte_l2_s_cache_mode_pt; static pt_entry_t pte_l2_s_cache_mask; /* * Crashdump maps. */ static caddr_t crashdumpmap; extern void bcopy_page(vm_offset_t, vm_offset_t); extern void bzero_page(vm_offset_t); extern vm_offset_t alloc_firstaddr; char *_tmppt; /* * Metadata for L1 translation tables. */ struct l1_ttable { /* Entry on the L1 Table list */ SLIST_ENTRY(l1_ttable) l1_link; /* Entry on the L1 Least Recently Used list */ TAILQ_ENTRY(l1_ttable) l1_lru; /* Track how many domains are allocated from this L1 */ volatile u_int l1_domain_use_count; /* * A free-list of domain numbers for this L1. * We avoid using ffs() and a bitmap to track domains since ffs() * is slow on ARM. */ u_int8_t l1_domain_first; u_int8_t l1_domain_free[PMAP_DOMAINS]; /* Physical address of this L1 page table */ vm_paddr_t l1_physaddr; /* KVA of this L1 page table */ pd_entry_t *l1_kva; }; /* * Convert a virtual address into its L1 table index. That is, the * index used to locate the L2 descriptor table pointer in an L1 table. * This is basically used to index l1->l1_kva[]. * * Each L2 descriptor table represents 1MB of VA space. */ #define L1_IDX(va) (((vm_offset_t)(va)) >> L1_S_SHIFT) /* * L1 Page Tables are tracked using a Least Recently Used list. * - New L1s are allocated from the HEAD. * - Freed L1s are added to the TAIl. * - Recently accessed L1s (where an 'access' is some change to one of * the userland pmaps which owns this L1) are moved to the TAIL. */ static TAILQ_HEAD(, l1_ttable) l1_lru_list; /* * A list of all L1 tables */ static SLIST_HEAD(, l1_ttable) l1_list; static struct mtx l1_lru_lock; /* * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots. * * This is normally 16MB worth L2 page descriptors for any given pmap. * Reference counts are maintained for L2 descriptors so they can be * freed when empty. */ struct l2_dtable { /* The number of L2 page descriptors allocated to this l2_dtable */ u_int l2_occupancy; /* List of L2 page descriptors */ struct l2_bucket { pt_entry_t *l2b_kva; /* KVA of L2 Descriptor Table */ vm_paddr_t l2b_phys; /* Physical address of same */ u_short l2b_l1idx; /* This L2 table's L1 index */ u_short l2b_occupancy; /* How many active descriptors */ } l2_bucket[L2_BUCKET_SIZE]; }; /* pmap_kenter_internal flags */ #define KENTER_CACHE 0x1 #define KENTER_USER 0x2 /* * Given an L1 table index, calculate the corresponding l2_dtable index * and bucket index within the l2_dtable. */ #define L2_IDX(l1idx) (((l1idx) >> L2_BUCKET_LOG2) & \ (L2_SIZE - 1)) #define L2_BUCKET(l1idx) ((l1idx) & (L2_BUCKET_SIZE - 1)) /* * Given a virtual address, this macro returns the * virtual address required to drop into the next L2 bucket. */ #define L2_NEXT_BUCKET(va) (((va) & L1_S_FRAME) + L1_S_SIZE) /* * We try to map the page tables write-through, if possible. However, not * all CPUs have a write-through cache mode, so on those we have to sync * the cache when we frob page tables. * * We try to evaluate this at compile time, if possible. However, it's * not always possible to do that, hence this run-time var. */ int pmap_needs_pte_sync; /* * Macro to determine if a mapping might be resident in the * instruction cache and/or TLB */ #define PV_BEEN_EXECD(f) (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC)) /* * Macro to determine if a mapping might be resident in the * data cache and/or TLB */ #define PV_BEEN_REFD(f) (((f) & PVF_REF) != 0) #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #define pmap_is_current(pm) ((pm) == kernel_pmap || \ curproc->p_vmspace->vm_map.pmap == (pm)) static uma_zone_t pvzone = NULL; uma_zone_t l2zone; static uma_zone_t l2table_zone; static vm_offset_t pmap_kernel_l2dtable_kva; static vm_offset_t pmap_kernel_l2ptp_kva; static vm_paddr_t pmap_kernel_l2ptp_phys; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; static struct rwlock pvh_global_lock; void pmap_copy_page_offs_generic(vm_paddr_t a_phys, vm_offset_t a_offs, vm_paddr_t b_phys, vm_offset_t b_offs, int cnt); /* * This list exists for the benefit of pmap_map_chunk(). It keeps track * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can * find them as necessary. * * Note that the data on this list MUST remain valid after initarm() returns, * as pmap_bootstrap() uses it to contruct L2 table metadata. */ SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list); static void pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt) { int i; l1->l1_kva = l1pt; l1->l1_domain_use_count = 0; l1->l1_domain_first = 0; for (i = 0; i < PMAP_DOMAINS; i++) l1->l1_domain_free[i] = i + 1; /* * Copy the kernel's L1 entries to each new L1. */ if (l1pt != kernel_pmap->pm_l1->l1_kva) memcpy(l1pt, kernel_pmap->pm_l1->l1_kva, L1_TABLE_SIZE); if ((l1->l1_physaddr = pmap_extract(kernel_pmap, (vm_offset_t)l1pt)) == 0) panic("pmap_init_l1: can't get PA of L1 at %p", l1pt); SLIST_INSERT_HEAD(&l1_list, l1, l1_link); TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); } static vm_offset_t kernel_pt_lookup(vm_paddr_t pa) { struct pv_addr *pv; SLIST_FOREACH(pv, &kernel_pt_list, pv_list) { if (pv->pv_pa == pa) return (pv->pv_va); } return (0); } void pmap_pte_init_generic(void) { pte_l1_s_cache_mode = L1_S_B|L1_S_C; pte_l1_s_cache_mask = L1_S_CACHE_MASK; pte_l2_l_cache_mode = L2_B|L2_C; pte_l2_l_cache_mask = L2_L_CACHE_MASK; pte_l2_s_cache_mode = L2_B|L2_C; pte_l2_s_cache_mask = L2_S_CACHE_MASK; /* * If we have a write-through cache, set B and C. If * we have a write-back cache, then we assume setting * only C will make those pages write-through. */ if (cpufuncs.cf_dcache_wb_range == (void *) cpufunc_nullop) { pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C; pte_l2_l_cache_mode_pt = L2_B|L2_C; pte_l2_s_cache_mode_pt = L2_B|L2_C; } else { pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } } /* * Allocate an L1 translation table for the specified pmap. * This is called at pmap creation time. */ static void pmap_alloc_l1(pmap_t pm) { struct l1_ttable *l1; u_int8_t domain; /* * Remove the L1 at the head of the LRU list */ mtx_lock(&l1_lru_lock); l1 = TAILQ_FIRST(&l1_lru_list); TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Pick the first available domain number, and update * the link to the next number. */ domain = l1->l1_domain_first; l1->l1_domain_first = l1->l1_domain_free[domain]; /* * If there are still free domain numbers in this L1, * put it back on the TAIL of the LRU list. */ if (++l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); /* * Fix up the relevant bits in the pmap structure */ pm->pm_l1 = l1; pm->pm_domain = domain + 1; } /* * Free an L1 translation table. * This is called at pmap destruction time. */ static void pmap_free_l1(pmap_t pm) { struct l1_ttable *l1 = pm->pm_l1; mtx_lock(&l1_lru_lock); /* * If this L1 is currently on the LRU list, remove it. */ if (l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Free up the domain number which was allocated to the pmap */ l1->l1_domain_free[pm->pm_domain - 1] = l1->l1_domain_first; l1->l1_domain_first = pm->pm_domain - 1; l1->l1_domain_use_count--; /* * The L1 now must have at least 1 free domain, so add * it back to the LRU list. If the use count is zero, * put it at the head of the list, otherwise it goes * to the tail. */ if (l1->l1_domain_use_count == 0) { TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru); } else TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA, or NULL if no L2 bucket exists for the address. */ static PMAP_INLINE struct l2_bucket * pmap_get_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL || (l2b = &l2->l2_bucket[L2_BUCKET(l1idx)])->l2b_kva == NULL) return (NULL); return (l2b); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA. * * If no L2 bucket exists, perform the necessary allocations to put an L2 * bucket/page table in place. * * Note that if a new L2 bucket/page was allocated, the caller *must* * increment the bucket occupancy counter appropriately *before* * releasing the pmap's lock to ensure no other thread or cpu deallocates * the bucket/page in the meantime. */ static struct l2_bucket * pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); PMAP_ASSERT_LOCKED(pm); rw_assert(&pvh_global_lock, RA_WLOCKED); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) { /* * No mapping at this address, as there is * no entry in the L1 table. * Need to allocate a new l2_dtable. */ PMAP_UNLOCK(pm); rw_wunlock(&pvh_global_lock); if ((l2 = uma_zalloc(l2table_zone, M_NOWAIT)) == NULL) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); return (NULL); } rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); if (pm->pm_l2[L2_IDX(l1idx)] != NULL) { /* * Someone already allocated the l2_dtable while * we were doing the same. */ uma_zfree(l2table_zone, l2); l2 = pm->pm_l2[L2_IDX(l1idx)]; } else { bzero(l2, sizeof(*l2)); /* * Link it into the parent pmap */ pm->pm_l2[L2_IDX(l1idx)] = l2; } } l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; /* * Fetch pointer to the L2 page table associated with the address. */ if (l2b->l2b_kva == NULL) { pt_entry_t *ptep; /* * No L2 page table has been allocated. Chances are, this * is because we just allocated the l2_dtable, above. */ l2->l2_occupancy++; PMAP_UNLOCK(pm); rw_wunlock(&pvh_global_lock); ptep = uma_zalloc(l2zone, M_NOWAIT); rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); if (l2b->l2b_kva != NULL) { /* We lost the race. */ l2->l2_occupancy--; uma_zfree(l2zone, ptep); return (l2b); } l2b->l2b_phys = vtophys(ptep); if (ptep == NULL) { /* * Oops, no more L2 page tables available at this * time. We may need to deallocate the l2_dtable * if we allocated a new one above. */ l2->l2_occupancy--; if (l2->l2_occupancy == 0) { pm->pm_l2[L2_IDX(l1idx)] = NULL; uma_zfree(l2table_zone, l2); } return (NULL); } l2b->l2b_kva = ptep; l2b->l2b_l1idx = l1idx; } return (l2b); } static PMAP_INLINE void #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(pt_entry_t *l2) #else pmap_free_l2_ptp(boolean_t need_sync, pt_entry_t *l2) #endif { #ifdef PMAP_INCLUDE_PTE_SYNC /* * Note: With a write-back cache, we may need to sync this * L2 table before re-using it. * This is because it may have belonged to a non-current * pmap, in which case the cache syncs would have been * skipped when the pages were being unmapped. If the * L2 table were then to be immediately re-allocated to * the *current* pmap, it may well contain stale mappings * which have not yet been cleared by a cache write-back * and so would still be visible to the mmu. */ if (need_sync) PTE_SYNC_RANGE(l2, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); #endif uma_zfree(l2zone, l2); } /* * One or more mappings in the specified L2 descriptor table have just been * invalidated. * * Garbage collect the metadata and descriptor table itself if necessary. * * The pmap lock must be acquired when this is called (not necessary * for the kernel pmap). */ static void pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; /* * Update the bucket's reference count according to how many * PTEs the caller has just invalidated. */ l2b->l2b_occupancy -= count; /* * Note: * * Level 2 page tables allocated to the kernel pmap are never freed * as that would require checking all Level 1 page tables and * removing any references to the Level 2 page table. See also the * comment elsewhere about never freeing bootstrap L2 descriptors. * * We make do with just invalidating the mapping in the L2 table. * * This isn't really a big deal in practice and, in fact, leads * to a performance win over time as we don't need to continually * alloc/free. */ if (l2b->l2b_occupancy > 0 || pm == kernel_pmap) return; /* * There are no more valid mappings in this level 2 page table. * Go ahead and NULL-out the pointer in the bucket, then * free the page table. */ l1idx = l2b->l2b_l1idx; ptep = l2b->l2b_kva; l2b->l2b_kva = NULL; pl1pd = &pm->pm_l1->l1_kva[l1idx]; /* * If the L1 slot matches the pmap's domain * number, then invalidate it. */ l1pd = *pl1pd & (L1_TYPE_MASK | L1_C_DOM_MASK); if (l1pd == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) { *pl1pd = 0; PTE_SYNC(pl1pd); } /* * Release the L2 descriptor table back to the pool cache. */ #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(ptep); #else pmap_free_l2_ptp(!pmap_is_current(pm), ptep); #endif /* * Update the reference count in the associated l2_dtable */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (--l2->l2_occupancy > 0) return; /* * There are no more valid mappings in any of the Level 1 * slots managed by this l2_dtable. Go ahead and NULL-out * the pointer in the parent pmap and free the l2_dtable. */ pm->pm_l2[L2_IDX(l1idx)] = NULL; uma_zfree(l2table_zone, l2); } /* * Pool cache constructors for L2 descriptor tables, metadata and pmap * structures. */ static int pmap_l2ptp_ctor(void *mem, int size, void *arg, int flags) { #ifndef PMAP_INCLUDE_PTE_SYNC struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK; /* * The mappings for these page tables were initially made using * pmap_kenter() by the pool subsystem. Therefore, the cache- * mode will not be right for page table mappings. To avoid * polluting the pmap_kenter() code with a special case for * page tables, we simply fix up the cache-mode here if it's not * correct. */ l2b = pmap_get_l2_bucket(kernel_pmap, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { /* * Page tables must have the cache-mode set to * Write-Thru. */ *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); cpu_cpwait(); } #endif memset(mem, 0, L2_TABLE_SIZE_REAL); PTE_SYNC_RANGE(mem, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); return (0); } /* * A bunch of routines to conditionally flush the caches/TLB depending * on whether the specified pmap actually needs to be flushed at any * given time. */ static PMAP_INLINE void pmap_tlb_flushID_SE(pmap_t pm, vm_offset_t va) { if (pmap_is_current(pm)) cpu_tlb_flushID_SE(va); } static PMAP_INLINE void pmap_tlb_flushD_SE(pmap_t pm, vm_offset_t va) { if (pmap_is_current(pm)) cpu_tlb_flushD_SE(va); } static PMAP_INLINE void pmap_tlb_flushID(pmap_t pm) { if (pmap_is_current(pm)) cpu_tlb_flushID(); } static PMAP_INLINE void pmap_tlb_flushD(pmap_t pm) { if (pmap_is_current(pm)) cpu_tlb_flushD(); } static int pmap_has_valid_mapping(pmap_t pm, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *ptep; if (pmap_get_pde_pte(pm, va, &pde, &ptep) && ptep && ((*ptep & L2_TYPE_MASK) != L2_TYPE_INV)) return (1); return (0); } static PMAP_INLINE void pmap_idcache_wbinv_range(pmap_t pm, vm_offset_t va, vm_size_t len) { vm_size_t rest; CTR4(KTR_PMAP, "pmap_dcache_wbinv_range: pmap %p is_kernel %d va 0x%08x" " len 0x%x ", pm, pm == kernel_pmap, va, len); if (pmap_is_current(pm) || pm == kernel_pmap) { rest = MIN(PAGE_SIZE - (va & PAGE_MASK), len); while (len > 0) { if (pmap_has_valid_mapping(pm, va)) { cpu_idcache_wbinv_range(va, rest); cpu_l2cache_wbinv_range(va, rest); } len -= rest; va += rest; rest = MIN(PAGE_SIZE, len); } } } static PMAP_INLINE void pmap_dcache_wb_range(pmap_t pm, vm_offset_t va, vm_size_t len, boolean_t do_inv, boolean_t rd_only) { vm_size_t rest; CTR4(KTR_PMAP, "pmap_dcache_wb_range: pmap %p is_kernel %d va 0x%08x " "len 0x%x ", pm, pm == kernel_pmap, va, len); CTR2(KTR_PMAP, " do_inv %d rd_only %d", do_inv, rd_only); if (pmap_is_current(pm)) { rest = MIN(PAGE_SIZE - (va & PAGE_MASK), len); while (len > 0) { if (pmap_has_valid_mapping(pm, va)) { if (do_inv && rd_only) { cpu_dcache_inv_range(va, rest); cpu_l2cache_inv_range(va, rest); } else if (do_inv) { cpu_dcache_wbinv_range(va, rest); cpu_l2cache_wbinv_range(va, rest); } else if (!rd_only) { cpu_dcache_wb_range(va, rest); cpu_l2cache_wb_range(va, rest); } } len -= rest; va += rest; rest = MIN(PAGE_SIZE, len); } } } static PMAP_INLINE void pmap_idcache_wbinv_all(pmap_t pm) { if (pmap_is_current(pm)) { cpu_idcache_wbinv_all(); cpu_l2cache_wbinv_all(); } } #ifdef notyet static PMAP_INLINE void pmap_dcache_wbinv_all(pmap_t pm) { if (pmap_is_current(pm)) { cpu_dcache_wbinv_all(); cpu_l2cache_wbinv_all(); } } #endif /* * PTE_SYNC_CURRENT: * * Make sure the pte is written out to RAM. * We need to do this for one of two cases: * - We're dealing with the kernel pmap * - There is no pmap active in the cache/tlb. * - The specified pmap is 'active' in the cache/tlb. */ #ifdef PMAP_INCLUDE_PTE_SYNC #define PTE_SYNC_CURRENT(pm, ptep) \ do { \ if (PMAP_NEEDS_PTE_SYNC && \ pmap_is_current(pm)) \ PTE_SYNC(ptep); \ } while (/*CONSTCOND*/0) #else #define PTE_SYNC_CURRENT(pm, ptep) /* nothing */ #endif /* * cacheable == -1 means we must make the entry uncacheable, 1 means * cacheable; */ static __inline void pmap_set_cache_entry(pv_entry_t pv, pmap_t pm, vm_offset_t va, int cacheable) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; if (cacheable == 1) { pte = (*ptep & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode; if (l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } } else { pte = *ptep &~ L2_S_CACHE_MASK; if ((va != pv->pv_va || pm != pv->pv_pmap) && l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_idcache_wbinv_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE); pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, TRUE, (pv->pv_flags & PVF_WRITE) == 0); pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } } *ptep = pte; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); } static void pmap_fix_cache(struct vm_page *pg, pmap_t pm, vm_offset_t va) { int pmwc = 0; int writable = 0, kwritable = 0, uwritable = 0; int entries = 0, kentries = 0, uentries = 0; struct pv_entry *pv; rw_assert(&pvh_global_lock, RA_WLOCKED); /* the cache gets written back/invalidated on context switch. * therefore, if a user page shares an entry in the same page or * with the kernel map and at least one is writable, then the * cache entry must be set write-through. */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* generate a count of the pv_entry uses */ if (pv->pv_flags & PVF_WRITE) { if (pv->pv_pmap == kernel_pmap) kwritable++; else if (pv->pv_pmap == pm) uwritable++; writable++; } if (pv->pv_pmap == kernel_pmap) kentries++; else { if (pv->pv_pmap == pm) uentries++; entries++; } } /* * check if the user duplicate mapping has * been removed. */ if ((pm != kernel_pmap) && (((uentries > 1) && uwritable) || (uwritable > 1))) pmwc = 1; TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* check for user uncachable conditions - order is important */ if (pm != kernel_pmap && (pv->pv_pmap == pm || pv->pv_pmap == kernel_pmap)) { if ((uentries > 1 && uwritable) || uwritable > 1) { /* user duplicate mapping */ if (pv->pv_pmap != kernel_pmap) pv->pv_flags |= PVF_MWC; if (!(pv->pv_flags & PVF_NC)) { pv->pv_flags |= PVF_NC; pmap_set_cache_entry(pv, pm, va, -1); } continue; } else /* no longer a duplicate user */ pv->pv_flags &= ~PVF_MWC; } /* * check for kernel uncachable conditions * kernel writable or kernel readable with writable user entry */ if ((kwritable && (entries || kentries > 1)) || (kwritable > 1) || ((kwritable != writable) && kentries && (pv->pv_pmap == kernel_pmap || (pv->pv_flags & PVF_WRITE) || (pv->pv_flags & PVF_MWC)))) { if (!(pv->pv_flags & PVF_NC)) { pv->pv_flags |= PVF_NC; pmap_set_cache_entry(pv, pm, va, -1); } continue; } /* kernel and user are cachable */ if ((pm == kernel_pmap) && !(pv->pv_flags & PVF_MWC) && (pv->pv_flags & PVF_NC)) { pv->pv_flags &= ~PVF_NC; if (pg->md.pv_memattr != VM_MEMATTR_UNCACHEABLE) pmap_set_cache_entry(pv, pm, va, 1); continue; } /* user is no longer sharable and writable */ if (pm != kernel_pmap && (pv->pv_pmap == pm || pv->pv_pmap == kernel_pmap) && !pmwc && (pv->pv_flags & PVF_NC)) { pv->pv_flags &= ~(PVF_NC | PVF_MWC); if (pg->md.pv_memattr != VM_MEMATTR_UNCACHEABLE) pmap_set_cache_entry(pv, pm, va, 1); } } if ((kwritable == 0) && (writable == 0)) { pg->md.pvh_attrs &= ~PVF_MOD; vm_page_aflag_clear(pg, PGA_WRITEABLE); return; } } /* * Modify pte bits for all ptes corresponding to the given physical address. * We use `maskbits' rather than `clearbits' because we're always passing * constants and the latter would require an extra inversion at run-time. */ static int pmap_clearbit(struct vm_page *pg, u_int maskbits) { struct l2_bucket *l2b; struct pv_entry *pv; pt_entry_t *ptep, npte, opte; pmap_t pm; vm_offset_t va; u_int oflags; int count = 0; rw_wlock(&pvh_global_lock); if (maskbits & PVF_WRITE) maskbits |= PVF_MOD; /* * Clear saved attributes (modify, reference) */ pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF)); if (TAILQ_EMPTY(&pg->md.pv_list)) { rw_wunlock(&pvh_global_lock); return (0); } /* * Loop over all current mappings setting/clearing as appropos */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { va = pv->pv_va; pm = pv->pv_pmap; oflags = pv->pv_flags; if (!(oflags & maskbits)) { if ((maskbits & PVF_WRITE) && (pv->pv_flags & PVF_NC)) { if (pg->md.pv_memattr != VM_MEMATTR_UNCACHEABLE) { PMAP_LOCK(pm); l2b = pmap_get_l2_bucket(pm, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; *ptep |= pte_l2_s_cache_mode; PTE_SYNC(ptep); PMAP_UNLOCK(pm); } pv->pv_flags &= ~(PVF_NC | PVF_MWC); } continue; } pv->pv_flags &= ~maskbits; PMAP_LOCK(pm); l2b = pmap_get_l2_bucket(pm, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; npte = opte = *ptep; if (maskbits & (PVF_WRITE|PVF_MOD)) { if ((pv->pv_flags & PVF_NC)) { /* * Entry is not cacheable: * * Don't turn caching on again if this is a * modified emulation. This would be * inconsistent with the settings created by * pmap_fix_cache(). Otherwise, it's safe * to re-enable caching. * * There's no need to call pmap_fix_cache() * here: all pages are losing their write * permission. */ if (maskbits & PVF_WRITE) { if (pg->md.pv_memattr != VM_MEMATTR_UNCACHEABLE) npte |= pte_l2_s_cache_mode; pv->pv_flags &= ~(PVF_NC | PVF_MWC); } } else if (opte & L2_S_PROT_W) { vm_page_dirty(pg); /* * Entry is writable/cacheable: check if pmap * is current if it is flush it, otherwise it * won't be in the cache */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, (maskbits & PVF_REF) ? TRUE : FALSE, FALSE); } /* make the pte read only */ npte &= ~L2_S_PROT_W; } if (maskbits & PVF_REF) { if ((pv->pv_flags & PVF_NC) == 0 && (maskbits & (PVF_WRITE|PVF_MOD)) == 0) { /* * Check npte here; we may have already * done the wbinv above, and the validity * of the PTE is the same for opte and * npte. */ if (npte & L2_S_PROT_W) { if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, FALSE); } else if ((npte & L2_TYPE_MASK) != L2_TYPE_INV) { /* XXXJRT need idcache_inv_range */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, TRUE); } } /* * Make the PTE invalid so that we will take a * page fault the next time the mapping is * referenced. */ npte &= ~L2_TYPE_MASK; npte |= L2_TYPE_INV; } if (npte != opte) { count++; *ptep = npte; PTE_SYNC(ptep); /* Flush the TLB entry if a current pmap. */ if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pm, pv->pv_va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pm, pv->pv_va); } PMAP_UNLOCK(pm); } if (maskbits & PVF_WRITE) vm_page_aflag_clear(pg, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); return (count); } /* * main pv_entry manipulation functions: * pmap_enter_pv: enter a mapping onto a vm_page list * pmap_remove_pv: remove a mappiing from a vm_page list * * NOTE: pmap_enter_pv expects to lock the pvh itself * pmap_remove_pv expects the caller to lock the pvh before calling */ /* * pmap_enter_pv: enter a mapping onto a vm_page's PV list * * => caller should hold the proper lock on pvh_global_lock * => caller should have pmap locked * => we will (someday) gain the lock on the vm_page's PV list * => caller should adjust ptp's wire_count before calling * => caller should not adjust pmap's wire_count */ static void pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm, vm_offset_t va, u_int flags) { rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_ASSERT_LOCKED(pm); if (pg->md.pv_kva != 0) { pve->pv_pmap = kernel_pmap; pve->pv_va = pg->md.pv_kva; pve->pv_flags = PVF_WRITE | PVF_UNMAN; if (pm != kernel_pmap) PMAP_LOCK(kernel_pmap); TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list); TAILQ_INSERT_HEAD(&kernel_pmap->pm_pvlist, pve, pv_plist); if (pm != kernel_pmap) PMAP_UNLOCK(kernel_pmap); pg->md.pv_kva = 0; if ((pve = pmap_get_pv_entry()) == NULL) panic("pmap_kenter_pv: no pv entries"); } pve->pv_pmap = pm; pve->pv_va = va; pve->pv_flags = flags; TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list); TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist); pg->md.pvh_attrs |= flags & (PVF_REF | PVF_MOD); if (pve->pv_flags & PVF_WIRED) ++pm->pm_stats.wired_count; vm_page_aflag_set(pg, PGA_REFERENCED); } /* * * pmap_find_pv: Find a pv entry * * => caller should hold lock on vm_page */ static PMAP_INLINE struct pv_entry * pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) if (pm == pv->pv_pmap && va == pv->pv_va) break; return (pv); } /* * vector_page_setprot: * * Manipulate the protection of the vector page. */ void vector_page_setprot(int prot) { struct l2_bucket *l2b; pt_entry_t *ptep; l2b = pmap_get_l2_bucket(kernel_pmap, vector_page); ptep = &l2b->l2b_kva[l2pte_index(vector_page)]; *ptep = (*ptep & ~L1_S_PROT_MASK) | L2_S_PROT(PTE_KERNEL, prot); PTE_SYNC(ptep); cpu_tlb_flushD_SE(vector_page); cpu_cpwait(); } /* * pmap_remove_pv: try to remove a mapping from a pv_list * * => caller should hold proper lock on pmap_main_lock * => pmap should be locked * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should adjust ptp's wire_count and free PTP if needed * => caller should NOT adjust pmap's wire_count * => we return the removed pve */ static void pmap_nuke_pv(struct vm_page *pg, pmap_t pm, struct pv_entry *pve) { struct pv_entry *pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_ASSERT_LOCKED(pm); TAILQ_REMOVE(&pg->md.pv_list, pve, pv_list); TAILQ_REMOVE(&pm->pm_pvlist, pve, pv_plist); if (pve->pv_flags & PVF_WIRED) --pm->pm_stats.wired_count; if (pg->md.pvh_attrs & PVF_MOD) vm_page_dirty(pg); if (TAILQ_FIRST(&pg->md.pv_list) == NULL) pg->md.pvh_attrs &= ~PVF_REF; else vm_page_aflag_set(pg, PGA_REFERENCED); if ((pve->pv_flags & PVF_NC) && ((pm == kernel_pmap) || (pve->pv_flags & PVF_WRITE) || !(pve->pv_flags & PVF_MWC))) pmap_fix_cache(pg, pm, 0); else if (pve->pv_flags & PVF_WRITE) { TAILQ_FOREACH(pve, &pg->md.pv_list, pv_list) if (pve->pv_flags & PVF_WRITE) break; if (!pve) { pg->md.pvh_attrs &= ~PVF_MOD; vm_page_aflag_clear(pg, PGA_WRITEABLE); } } pv = TAILQ_FIRST(&pg->md.pv_list); if (pv != NULL && (pv->pv_flags & PVF_UNMAN) && TAILQ_NEXT(pv, pv_list) == NULL) { pm = kernel_pmap; pg->md.pv_kva = pv->pv_va; /* a recursive pmap_nuke_pv */ TAILQ_REMOVE(&pg->md.pv_list, pv, pv_list); TAILQ_REMOVE(&pm->pm_pvlist, pv, pv_plist); if (pv->pv_flags & PVF_WIRED) --pm->pm_stats.wired_count; pg->md.pvh_attrs &= ~PVF_REF; pg->md.pvh_attrs &= ~PVF_MOD; vm_page_aflag_clear(pg, PGA_WRITEABLE); pmap_free_pv_entry(pv); } } static struct pv_entry * pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pve; rw_assert(&pvh_global_lock, RA_WLOCKED); pve = TAILQ_FIRST(&pg->md.pv_list); while (pve) { if (pve->pv_pmap == pm && pve->pv_va == va) { /* match? */ pmap_nuke_pv(pg, pm, pve); break; } pve = TAILQ_NEXT(pve, pv_list); } if (pve == NULL && pg->md.pv_kva == va) pg->md.pv_kva = 0; return(pve); /* return removed pve */ } /* * * pmap_modify_pv: Update pv flags * * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should NOT adjust pmap's wire_count * => we return the old flags * * Modify a physical-virtual mapping in the pv table */ static u_int pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va, u_int clr_mask, u_int set_mask) { struct pv_entry *npv; u_int flags, oflags; PMAP_ASSERT_LOCKED(pm); rw_assert(&pvh_global_lock, RA_WLOCKED); if ((npv = pmap_find_pv(pg, pm, va)) == NULL) return (0); /* * There is at least one VA mapping this page. */ if (clr_mask & (PVF_REF | PVF_MOD)) pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD); oflags = npv->pv_flags; npv->pv_flags = flags = (oflags & ~clr_mask) | set_mask; if ((flags ^ oflags) & PVF_WIRED) { if (flags & PVF_WIRED) ++pm->pm_stats.wired_count; else --pm->pm_stats.wired_count; } if ((flags ^ oflags) & PVF_WRITE) pmap_fix_cache(pg, pm, 0); return (oflags); } /* Function to set the debug level of the pmap code */ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ void pmap_pinit0(struct pmap *pmap) { PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap)); bcopy(kernel_pmap, pmap, sizeof(*pmap)); bzero(&pmap->pm_mtx, sizeof(pmap->pm_mtx)); PMAP_LOCK_INIT(pmap); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_DEFAULT; m->md.pvh_attrs = 0; m->md.pv_kva = 0; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { int shpgperproc = PMAP_SHPGPERPROC; l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); /* * Initialize the PV entry allocator. */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; uma_zone_reserve_kva(pvzone, pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * Now it is safe to enable pv_table recording. */ PDEBUG(1, printf("pmap_init: done!\n")); } int pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user) { struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; int rv = 0; l1idx = L1_IDX(va); rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); /* * If there is no l2_dtable for this address, then the process * has no business accessing it. * * Note: This will catch userland processes trying to access * kernel addresses. */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL) goto out; /* * Likewise if there is no L2 descriptor table */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; if (l2b->l2b_kva == NULL) goto out; /* * Check the PTE itself. */ ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if (pte == 0) goto out; /* * Catch a userland access to the vector page mapped at 0x0 */ if (user && (pte & L2_S_PROT_U) == 0) goto out; if (va == vector_page) goto out; pa = l2pte_pa(pte); if ((ftype & VM_PROT_WRITE) && (pte & L2_S_PROT_W) == 0) { /* * This looks like a good candidate for "page modified" * emulation... */ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) { goto out; } /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) { goto out; } /* * Do the flags say this page is writable? If not then it * is a genuine write fault. If yes then the write fault is * our fault as we did not reflect the write access in the * PTE. Now we know a write has occurred we can correct this * and also set the modified bit */ if ((pv->pv_flags & PVF_WRITE) == 0) { goto out; } pg->md.pvh_attrs |= PVF_REF | PVF_MOD; vm_page_dirty(pg); pv->pv_flags |= PVF_REF | PVF_MOD; /* * Re-enable write permissions for the page. No need to call * pmap_fix_cache(), since this is just a * modified-emulation fault, and the PVF_WRITE bit isn't * changing. We've already set the cacheable bits based on * the assumption that we can write to this page. */ *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO | L2_S_PROT_W; PTE_SYNC(ptep); rv = 1; } else if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) { /* * This looks like a good candidate for "page referenced" * emulation. */ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) goto out; /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) goto out; pg->md.pvh_attrs |= PVF_REF; pv->pv_flags |= PVF_REF; *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO; PTE_SYNC(ptep); rv = 1; } /* * We know there is a valid mapping here, so simply * fix up the L1 if necessary. */ pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); rv = 1; } #ifdef DEBUG /* * If 'rv == 0' at this point, it generally indicates that there is a * stale TLB entry for the faulting address. This happens when two or * more processes are sharing an L1. Since we don't flush the TLB on * a context switch between such processes, we can take domain faults * for mappings which exist at the same VA in both processes. EVEN IF * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for * example. * * This is extremely likely to happen if pmap_enter() updated the L1 * entry for a recently entered mapping. In this case, the TLB is * flushed for the new mapping, but there may still be TLB entries for * other mappings belonging to other processes in the 1MB range * covered by the L1 entry. * * Since 'rv == 0', we know that the L1 already contains the correct * value, so the fault must be due to a stale TLB entry. * * Since we always need to flush the TLB anyway in the case where we * fixed up the L1, or frobbed the L2 PTE, we effectively deal with * stale TLB entries dynamically. * * However, the above condition can ONLY happen if the current L1 is * being shared. If it happens when the L1 is unshared, it indicates * that other parts of the pmap are not doing their job WRT managing * the TLB. */ if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) { printf("fixup: pm %p, va 0x%lx, ftype %d - nothing to do!\n", pm, (u_long)va, ftype); printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n", l2, l2b, ptep, pl1pd); printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n", pte, l1pd, last_fault_code); #ifdef DDB Debugger(); #endif } #endif cpu_tlb_flushID_SE(va); cpu_cpwait(); rv = 1; out: rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); return (rv); } void pmap_postinit(void) { struct l2_bucket *l2b; struct l1_ttable *l1; pd_entry_t *pl1pt; pt_entry_t *ptep, pte; vm_offset_t va, eva; u_int loop, needed; needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0); needed -= 1; l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK); for (loop = 0; loop < needed; loop++, l1++) { /* Allocate a L1 page table */ va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, M_VMPMAP, 0, 0x0, 0xffffffff, L1_TABLE_SIZE, 0); if (va == 0) panic("Cannot allocate L1 KVM"); eva = va + L1_TABLE_SIZE; pl1pt = (pd_entry_t *)va; while (va < eva) { l2b = pmap_get_l2_bucket(kernel_pmap, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; *ptep = pte; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); va += PAGE_SIZE; } pmap_init_l1(l1, pl1pt); } #ifdef DEBUG printf("pmap_postinit: Allocated %d static L1 descriptor tables\n", needed); #endif } /* * This is used to stuff certain critical values into the PCB where they * can be accessed quickly from cpu_switch() et al. */ void pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb) { struct l2_bucket *l2b; pcb->pcb_pagedir = pm->pm_l1->l1_physaddr; pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) | (DOMAIN_CLIENT << (pm->pm_domain * 2)); if (vector_page < KERNBASE) { pcb->pcb_pl1vec = &pm->pm_l1->l1_kva[L1_IDX(vector_page)]; l2b = pmap_get_l2_bucket(pm, vector_page); pcb->pcb_l1vec = l2b->l2b_phys | L1_C_PROTO | L1_C_DOM(pm->pm_domain) | L1_C_DOM(PMAP_DOMAIN_KERNEL); } else pcb->pcb_pl1vec = NULL; } void pmap_activate(struct thread *td) { pmap_t pm; struct pcb *pcb; pm = vmspace_pmap(td->td_proc->p_vmspace); pcb = td->td_pcb; critical_enter(); pmap_set_pcb_pagedir(pm, pcb); if (td == curthread) { u_int cur_dacr, cur_ttb; __asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cur_ttb)); __asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(cur_dacr)); cur_ttb &= ~(L1_TABLE_SIZE - 1); if (cur_ttb == (u_int)pcb->pcb_pagedir && cur_dacr == pcb->pcb_dacr) { /* * No need to switch address spaces. */ critical_exit(); return; } /* * We MUST, I repeat, MUST fix up the L1 entry corresponding * to 'vector_page' in the incoming L1 table before switching * to it otherwise subsequent interrupts/exceptions (including * domain faults!) will jump into hyperspace. */ if (pcb->pcb_pl1vec) { *pcb->pcb_pl1vec = pcb->pcb_l1vec; /* * Don't need to PTE_SYNC() at this point since * cpu_setttb() is about to flush both the cache * and the TLB. */ } cpu_domains(pcb->pcb_dacr); cpu_setttb(pcb->pcb_pagedir); } critical_exit(); } static int pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va) { pd_entry_t *pdep, pde; pt_entry_t *ptep, pte; vm_offset_t pa; int rv = 0; /* * Make sure the descriptor itself has the correct cache mode */ pdep = &kl1[L1_IDX(va)]; pde = *pdep; if (l1pte_section_p(pde)) { if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) { *pdep = (pde & ~L1_S_CACHE_MASK) | pte_l1_s_cache_mode_pt; PTE_SYNC(pdep); cpu_dcache_wbinv_range((vm_offset_t)pdep, sizeof(*pdep)); cpu_l2cache_wbinv_range((vm_offset_t)pdep, sizeof(*pdep)); rv = 1; } } else { pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) panic("pmap_bootstrap: No L2 for L2 @ va %p\n", ptep); ptep = &ptep[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_dcache_wbinv_range((vm_offset_t)ptep, sizeof(*ptep)); cpu_l2cache_wbinv_range((vm_offset_t)ptep, sizeof(*ptep)); rv = 1; } } return (rv); } static void pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap, pt_entry_t **ptep) { vm_offset_t va = *availp; struct l2_bucket *l2b; if (ptep) { l2b = pmap_get_l2_bucket(kernel_pmap, va); if (l2b == NULL) panic("pmap_alloc_specials: no l2b for 0x%x", va); *ptep = &l2b->l2b_kva[l2pte_index(va)]; } *vap = va; *availp = va + (PAGE_SIZE * pages); } /* * Bootstrap the system enough to run with virtual memory. * * On the arm this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ #define PMAP_STATIC_L2_SIZE 16 void pmap_bootstrap(vm_offset_t firstaddr, struct pv_addr *l1pt) { static struct l1_ttable static_l1; static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE]; struct l1_ttable *l1 = &static_l1; struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t pde; pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va; pt_entry_t *ptep; pt_entry_t *qmap_pte; vm_paddr_t pa; vm_offset_t va; vm_size_t size; int l1idx, l2idx, l2next = 0; PDEBUG(1, printf("firstaddr = %08x, lastaddr = %08x\n", firstaddr, vm_max_kernel_address)); virtual_avail = firstaddr; kernel_pmap->pm_l1 = l1; kernel_l1pa = l1pt->pv_pa; /* * Scan the L1 translation table created by initarm() and create * the required metadata for all valid mappings found in it. */ for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) { pde = kernel_l1pt[l1idx]; /* * We're only interested in Coarse mappings. * pmap_extract() can deal with section mappings without * recourse to checking L2 metadata. */ if ((pde & L1_TYPE_MASK) != L1_TYPE_C) continue; /* * Lookup the KVA of this L2 descriptor table */ pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) { panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx", (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa); } /* * Fetch the associated L2 metadata structure. * Allocate a new one if necessary. */ if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) { if (l2next == PMAP_STATIC_L2_SIZE) panic("pmap_bootstrap: out of static L2s"); kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 = &static_l2[l2next++]; } /* * One more L1 slot tracked... */ l2->l2_occupancy++; /* * Fill in the details of the L2 descriptor in the * appropriate bucket. */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; l2b->l2b_kva = ptep; l2b->l2b_phys = pa; l2b->l2b_l1idx = l1idx; /* * Establish an initial occupancy count for this descriptor */ for (l2idx = 0; l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); l2idx++) { if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) { l2b->l2b_occupancy++; } } /* * Make sure the descriptor itself has the correct cache mode. * If not, fix it, but whine about the problem. Port-meisters * should consider this a clue to fix up their initarm() * function. :) */ if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) { printf("pmap_bootstrap: WARNING! wrong cache mode for " "L2 pte @ %p\n", ptep); } } /* * Ensure the primary (kernel) L1 has the correct cache mode for * a page table. Bitch if it is not correctly set. */ for (va = (vm_offset_t)kernel_l1pt; va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) { if (pmap_set_pt_cache_mode(kernel_l1pt, va)) printf("pmap_bootstrap: WARNING! wrong cache mode for " "primary L1 @ 0x%x\n", va); } cpu_dcache_wbinv_all(); cpu_l2cache_wbinv_all(); cpu_tlb_flushID(); cpu_cpwait(); PMAP_LOCK_INIT(kernel_pmap); CPU_FILL(&kernel_pmap->pm_active); kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL; TAILQ_INIT(&kernel_pmap->pm_pvlist); /* * Initialize the global pv list lock. */ rw_init_flags(&pvh_global_lock, "pmap pv global", RW_RECURSE); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte); pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte); pmap_alloc_specials(&virtual_avail, 1, &qmap_addr, &qmap_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)qmap_pte); size = ((vm_max_kernel_address - pmap_curmaxkvaddr) + L1_S_OFFSET) / L1_S_SIZE; pmap_alloc_specials(&virtual_avail, round_page(size * L2_TABLE_SIZE_REAL) / PAGE_SIZE, &pmap_kernel_l2ptp_kva, NULL); size = howmany(size, L2_BUCKET_SIZE); pmap_alloc_specials(&virtual_avail, round_page(size * sizeof(struct l2_dtable)) / PAGE_SIZE, &pmap_kernel_l2dtable_kva, NULL); pmap_alloc_specials(&virtual_avail, 1, (vm_offset_t*)&_tmppt, NULL); pmap_alloc_specials(&virtual_avail, MAXDUMPPGS, (vm_offset_t *)&crashdumpmap, NULL); SLIST_INIT(&l1_list); TAILQ_INIT(&l1_lru_list); mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF); pmap_init_l1(l1, kernel_l1pt); cpu_dcache_wbinv_all(); cpu_l2cache_wbinv_all(); virtual_avail = round_page(virtual_avail); virtual_end = vm_max_kernel_address; kernel_vm_end = pmap_curmaxkvaddr; mtx_init(&cmtx, "TMP mappings mtx", NULL, MTX_DEF); mtx_init(&qmap_mtx, "quick mapping mtx", NULL, MTX_DEF); pmap_set_pcb_pagedir(kernel_pmap, thread0.td_pcb); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { struct pcb *pcb; pmap_idcache_wbinv_all(pmap); cpu_l2cache_wbinv_all(); pmap_tlb_flushID(pmap); cpu_cpwait(); if (vector_page < KERNBASE) { struct pcb *curpcb = PCPU_GET(curpcb); pcb = thread0.td_pcb; if (pmap_is_current(pmap)) { /* * Frob the L1 entry corresponding to the vector * page so that it contains the kernel pmap's domain * number. This will ensure pmap_remove() does not * pull the current vector page out from under us. */ critical_enter(); *pcb->pcb_pl1vec = pcb->pcb_l1vec; cpu_domains(pcb->pcb_dacr); cpu_setttb(pcb->pcb_pagedir); critical_exit(); } pmap_remove(pmap, vector_page, vector_page + PAGE_SIZE); /* * Make sure cpu_switch(), et al, DTRT. This is safe to do * since this process has no remaining mappings of its own. */ curpcb->pcb_pl1vec = pcb->pcb_pl1vec; curpcb->pcb_l1vec = pcb->pcb_l1vec; curpcb->pcb_dacr = pcb->pcb_dacr; curpcb->pcb_pagedir = pcb->pcb_pagedir; } pmap_free_l1(pmap); dprintf("pmap_release()\n"); } /* * Helper function for pmap_grow_l2_bucket() */ static __inline int pmap_grow_map(vm_offset_t va, pt_entry_t cache_mode, vm_paddr_t *pap) { struct l2_bucket *l2b; pt_entry_t *ptep; vm_paddr_t pa; struct vm_page *pg; pg = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pg == NULL) return (1); pa = VM_PAGE_TO_PHYS(pg); if (pap) *pap = pa; l2b = pmap_get_l2_bucket(kernel_pmap, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; *ptep = L2_S_PROTO | pa | cache_mode | L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE); PTE_SYNC(ptep); return (0); } /* * This is the same as pmap_alloc_l2_bucket(), except that it is only * used by pmap_growkernel(). */ static __inline struct l2_bucket * pmap_grow_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; struct l1_ttable *l1; pd_entry_t *pl1pd; u_short l1idx; vm_offset_t nva; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) { /* * No mapping at this address, as there is * no entry in the L1 table. * Need to allocate a new l2_dtable. */ nva = pmap_kernel_l2dtable_kva; if ((nva & PAGE_MASK) == 0) { /* * Need to allocate a backing page */ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL)) return (NULL); } l2 = (struct l2_dtable *)nva; nva += sizeof(struct l2_dtable); if ((nva & PAGE_MASK) < (pmap_kernel_l2dtable_kva & PAGE_MASK)) { /* * The new l2_dtable straddles a page boundary. * Map in another page to cover it. */ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL)) return (NULL); } pmap_kernel_l2dtable_kva = nva; /* * Link it into the parent pmap */ pm->pm_l2[L2_IDX(l1idx)] = l2; memset(l2, 0, sizeof(*l2)); } l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; /* * Fetch pointer to the L2 page table associated with the address. */ if (l2b->l2b_kva == NULL) { pt_entry_t *ptep; /* * No L2 page table has been allocated. Chances are, this * is because we just allocated the l2_dtable, above. */ nva = pmap_kernel_l2ptp_kva; ptep = (pt_entry_t *)nva; if ((nva & PAGE_MASK) == 0) { /* * Need to allocate a backing page */ if (pmap_grow_map(nva, pte_l2_s_cache_mode_pt, &pmap_kernel_l2ptp_phys)) return (NULL); PTE_SYNC_RANGE(ptep, PAGE_SIZE / sizeof(pt_entry_t)); } memset(ptep, 0, L2_TABLE_SIZE_REAL); l2->l2_occupancy++; l2b->l2b_kva = ptep; l2b->l2b_l1idx = l1idx; l2b->l2b_phys = pmap_kernel_l2ptp_phys; pmap_kernel_l2ptp_kva += L2_TABLE_SIZE_REAL; pmap_kernel_l2ptp_phys += L2_TABLE_SIZE_REAL; } /* Distribute new L1 entry to all other L1s */ SLIST_FOREACH(l1, &l1_list, l1_link) { pl1pd = &l1->l1_kva[L1_IDX(va)]; *pl1pd = l2b->l2b_phys | L1_C_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO; PTE_SYNC(pl1pd); } return (l2b); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { pmap_t kpm = kernel_pmap; if (addr <= pmap_curmaxkvaddr) return; /* we are OK */ /* * whoops! we need to add kernel PTPs */ /* Map 1MB at a time */ for (; pmap_curmaxkvaddr < addr; pmap_curmaxkvaddr += L1_S_SIZE) pmap_grow_l2_bucket(kpm, pmap_curmaxkvaddr); /* * flush out the cache, expensive but growkernel will happen so * rarely */ cpu_dcache_wbinv_all(); cpu_l2cache_wbinv_all(); cpu_tlb_flushD(); cpu_cpwait(); kernel_vm_end = pmap_curmaxkvaddr; } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { struct pv_entry *pv, *npv; struct l2_bucket *l2b = NULL; vm_page_t m; pt_entry_t *pt; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); cpu_idcache_wbinv_all(); cpu_l2cache_wbinv_all(); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_flags & PVF_WIRED || pv->pv_flags & PVF_UNMAN) { /* Cannot remove wired or unmanaged pages now. */ npv = TAILQ_NEXT(pv, pv_plist); continue; } pmap->pm_stats.resident_count--; l2b = pmap_get_l2_bucket(pmap, pv->pv_va); KASSERT(l2b != NULL, ("No L2 bucket in pmap_remove_pages")); pt = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; m = PHYS_TO_VM_PAGE(*pt & L2_S_FRAME); KASSERT((vm_offset_t)m >= KERNBASE, ("Trying to access non-existent page va %x pte %x", pv->pv_va, *pt)); *pt = 0; PTE_SYNC(pt); npv = TAILQ_NEXT(pv, pv_plist); pmap_nuke_pv(m, pmap, pv); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_free_pv_entry(pv); pmap_free_l2_bucket(pmap, l2b, 1); } rw_wunlock(&pvh_global_lock); cpu_tlb_flushID(); cpu_cpwait(); PMAP_UNLOCK(pmap); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* Map a section into the KVA. */ /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); return ((void *)crashdumpmap); } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ static PMAP_INLINE void pmap_kenter_internal(vm_offset_t va, vm_offset_t pa, int flags) { struct l2_bucket *l2b; pt_entry_t *pte; pt_entry_t opte; struct pv_entry *pve; vm_page_t m; PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n", (uint32_t) va, (uint32_t) pa)); l2b = pmap_get_l2_bucket(kernel_pmap, va); if (l2b == NULL) l2b = pmap_grow_l2_bucket(kernel_pmap, va); KASSERT(l2b != NULL, ("No L2 Bucket")); pte = &l2b->l2b_kva[l2pte_index(va)]; opte = *pte; PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n", (uint32_t) pte, opte, *pte)); if (l2pte_valid(opte)) { pmap_kremove(va); } else { if (opte == 0) l2b->l2b_occupancy++; } *pte = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE); if (flags & KENTER_CACHE) *pte |= pte_l2_s_cache_mode; if (flags & KENTER_USER) *pte |= L2_S_PROT_U; PTE_SYNC(pte); /* * A kernel mapping may not be the page's only mapping, so create a PV * entry to ensure proper caching. * * The existence test for the pvzone is used to delay the recording of * kernel mappings until the VM system is fully initialized. * * This expects the physical memory to have a vm_page_array entry. */ if (pvzone != NULL && (m = vm_phys_paddr_to_vm_page(pa)) != NULL) { rw_wlock(&pvh_global_lock); if (!TAILQ_EMPTY(&m->md.pv_list) || m->md.pv_kva != 0) { if ((pve = pmap_get_pv_entry()) == NULL) panic("pmap_kenter_internal: no pv entries"); PMAP_LOCK(kernel_pmap); pmap_enter_pv(m, pve, kernel_pmap, va, PVF_WRITE | PVF_UNMAN); pmap_fix_cache(m, kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } else { m->md.pv_kva = va; } rw_wunlock(&pvh_global_lock); } } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, KENTER_CACHE); } void pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, 0); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { vm_offset_t sva; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; while (size != 0) { pmap_kenter_internal(va, pa, 0); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { vm_offset_t sva; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; while (size != 0) { pmap_kremove(va); va += PAGE_SIZE; size -= PAGE_SIZE; } } void pmap_kenter_user(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, KENTER_CACHE|KENTER_USER); /* * Call pmap_fault_fixup now, to make sure we'll have no exception * at the first use of the new address, or bad things will happen, * as we use one of these addresses in the exception handlers. */ pmap_fault_fixup(kernel_pmap, va, VM_PROT_READ|VM_PROT_WRITE, 1); } vm_paddr_t pmap_kextract(vm_offset_t va) { return (pmap_extract_locked(kernel_pmap, va)); } /* * remove a page from the kernel pagetables */ void pmap_kremove(vm_offset_t va) { struct l2_bucket *l2b; pt_entry_t *pte, opte; struct pv_entry *pve; vm_page_t m; vm_offset_t pa; l2b = pmap_get_l2_bucket(kernel_pmap, va); if (!l2b) return; KASSERT(l2b != NULL, ("No L2 Bucket")); pte = &l2b->l2b_kva[l2pte_index(va)]; opte = *pte; if (l2pte_valid(opte)) { /* pa = vtophs(va) taken from pmap_extract() */ if ((opte & L2_TYPE_MASK) == L2_TYPE_L) pa = (opte & L2_L_FRAME) | (va & L2_L_OFFSET); else pa = (opte & L2_S_FRAME) | (va & L2_S_OFFSET); /* note: should never have to remove an allocation * before the pvzone is initialized. */ rw_wlock(&pvh_global_lock); PMAP_LOCK(kernel_pmap); if (pvzone != NULL && (m = vm_phys_paddr_to_vm_page(pa)) && (pve = pmap_remove_pv(m, kernel_pmap, va))) pmap_free_pv_entry(pve); PMAP_UNLOCK(kernel_pmap); rw_wunlock(&pvh_global_lock); va = va & ~PAGE_MASK; cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_l2cache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); cpu_cpwait(); *pte = 0; } } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { vm_offset_t sva = *virt; vm_offset_t va = sva; PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, " "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end, prot)); while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } static void pmap_wb_page(vm_page_t m) { struct pv_entry *pv; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, FALSE, (pv->pv_flags & PVF_WRITE) == 0); } static void pmap_inv_page(vm_page_t m) { struct pv_entry *pv; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, TRUE, TRUE); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; for (i = 0; i < count; i++) { pmap_wb_page(m[i]); pmap_kenter_internal(va, VM_PAGE_TO_PHYS(m[i]), KENTER_CACHE); va += PAGE_SIZE; } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { vm_paddr_t pa; int i; for (i = 0; i < count; i++) { pa = vtophys(va); if (pa) { pmap_inv_page(PHYS_TO_VM_PAGE(pa)); pmap_kremove(va); } va += PAGE_SIZE; } } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; if (!pmap_get_pde_pte(pmap, addr, &pde, &pte)) return (FALSE); KASSERT(pte != NULL, ("Valid mapping but no pte ?")); if (*pte == 0) return (TRUE); return (FALSE); } /* * Fetch pointers to the PDE/PTE for the given pmap/VA pair. * Returns TRUE if the mapping exists, else FALSE. * * NOTE: This function is only used by a couple of arm-specific modules. * It is not safe to take any pmap locks here, since we could be right * in the middle of debugging the pmap anyway... * * It is possible for this routine to return FALSE even though a valid * mapping does exist. This is because we don't lock, so the metadata * state may be inconsistent. * * NOTE: We can return a NULL *ptp in the case where the L1 pde is * a "section" mapping. */ boolean_t pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; if (pm->pm_l1 == NULL) return (FALSE); l1idx = L1_IDX(va); *pdp = pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = *pl1pd; if (l1pte_section_p(l1pd)) { *ptp = NULL; return (TRUE); } if (pm->pm_l2 == NULL) return (FALSE); l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { return (FALSE); } *ptp = &ptep[l2pte_index(va)]; return (TRUE); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pt_entry_t *ptep; struct l2_bucket *l2b; boolean_t flush = FALSE; pmap_t curpm; int flags = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); if (TAILQ_EMPTY(&m->md.pv_list)) return; rw_wlock(&pvh_global_lock); /* * XXX This call shouldn't exist. Iterating over the PV list twice, * once in pmap_clearbit() and again below, is both unnecessary and * inefficient. The below code should itself write back the cache * entry before it destroys the mapping. */ pmap_clearbit(m, PVF_WRITE); curpm = vmspace_pmap(curproc->p_vmspace); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { if (flush == FALSE && (pv->pv_pmap == curpm || pv->pv_pmap == kernel_pmap)) flush = TRUE; PMAP_LOCK(pv->pv_pmap); /* * Cached contents were written-back in pmap_clearbit(), * but we still have to invalidate the cache entry to make * sure stale data are not retrieved when another page will be * mapped under this virtual address. */ if (pmap_is_current(pv->pv_pmap)) { cpu_dcache_inv_range(pv->pv_va, PAGE_SIZE); if (pmap_has_valid_mapping(pv->pv_pmap, pv->pv_va)) cpu_l2cache_inv_range(pv->pv_va, PAGE_SIZE); } if (pv->pv_flags & PVF_UNMAN) { /* remove the pv entry, but do not remove the mapping * and remember this is a kernel mapped page */ m->md.pv_kva = pv->pv_va; } else { /* remove the mapping and pv entry */ l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); KASSERT(l2b != NULL, ("No l2 bucket")); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; *ptep = 0; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); pmap_free_l2_bucket(pv->pv_pmap, l2b, 1); pv->pv_pmap->pm_stats.resident_count--; flags |= pv->pv_flags; } pmap_nuke_pv(m, pv->pv_pmap, pv); PMAP_UNLOCK(pv->pv_pmap); pmap_free_pv_entry(pv); } if (flush) { if (PV_BEEN_EXECD(flags)) pmap_tlb_flushID(curpm); else pmap_tlb_flushD(curpm); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_offset_t next_bucket; u_int flags; int flush; CTR4(KTR_PMAP, "pmap_protect: pmap %p sva 0x%08x eva 0x%08x prot %x", pm, sva, eva, prot); if ((prot & VM_PROT_READ) == 0) { pmap_remove(pm, sva, eva); return; } if (prot & VM_PROT_WRITE) { /* * If this is a read->write transition, just ignore it and let * vm_fault() take care of it later. */ return; } rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); /* * OK, at this point, we know we're doing write-protect operation. * If the pmap is active, write-back the range. */ pmap_dcache_wb_range(pm, sva, eva - sva, FALSE, FALSE); flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1; flags = 0; while (sva < eva) { next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; while (sva < next_bucket) { if ((pte = *ptep) != 0 && (pte & L2_S_PROT_W) != 0) { struct vm_page *pg; u_int f; pg = PHYS_TO_VM_PAGE(l2pte_pa(pte)); pte &= ~L2_S_PROT_W; *ptep = pte; PTE_SYNC(ptep); if (!(pg->oflags & VPO_UNMANAGED)) { f = pmap_modify_pv(pg, pm, sva, PVF_WRITE, 0); if (f & PVF_WRITE) vm_page_dirty(pg); } else f = 0; if (flush >= 0) { flush++; flags |= f; } else if (PV_BEEN_EXECD(f)) pmap_tlb_flushID_SE(pm, sva); else if (PV_BEEN_REFD(f)) pmap_tlb_flushD_SE(pm, sva); } sva += PAGE_SIZE; ptep++; } } if (flush) { if (PV_BEEN_EXECD(flags)) pmap_tlb_flushID(pm); else if (PV_BEEN_REFD(flags)) pmap_tlb_flushD(pm); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { int rv; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); rv = pmap_enter_locked(pmap, va, m, prot, flags); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * The pvh global and pmap locks must be held. */ static int pmap_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags) { struct l2_bucket *l2b = NULL; struct vm_page *opg; struct pv_entry *pve = NULL; pt_entry_t *ptep, npte, opte; u_int nflags; u_int oflags; vm_paddr_t pa; PMAP_ASSERT_LOCKED(pmap); rw_assert(&pvh_global_lock, RA_WLOCKED); if (va == vector_page) { pa = systempage.pv_pa; m = NULL; } else { if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); } nflags = 0; if (prot & VM_PROT_WRITE) nflags |= PVF_WRITE; if (prot & VM_PROT_EXECUTE) nflags |= PVF_EXEC; if ((flags & PMAP_ENTER_WIRED) != 0) nflags |= PVF_WIRED; PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, " "flags = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, flags)); if (pmap == kernel_pmap) { l2b = pmap_get_l2_bucket(pmap, va); if (l2b == NULL) l2b = pmap_grow_l2_bucket(pmap, va); } else { do_l2b_alloc: l2b = pmap_alloc_l2_bucket(pmap, va); if (l2b == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); goto do_l2b_alloc; } return (KERN_RESOURCE_SHORTAGE); } } ptep = &l2b->l2b_kva[l2pte_index(va)]; opte = *ptep; npte = pa; oflags = 0; if (opte) { /* * There is already a mapping at this address. * If the physical address is different, lookup the * vm_page. */ if (l2pte_pa(opte) != pa) opg = PHYS_TO_VM_PAGE(l2pte_pa(opte)); else opg = m; } else opg = NULL; if ((prot & (VM_PROT_ALL)) || (!m || m->md.pvh_attrs & PVF_REF)) { /* * - The access type indicates that we don't need * to do referenced emulation. * OR * - The physical page has already been referenced * so no need to re-do referenced emulation here. */ npte |= L2_S_PROTO; nflags |= PVF_REF; if (m && ((prot & VM_PROT_WRITE) != 0 || (m->md.pvh_attrs & PVF_MOD))) { /* * This is a writable mapping, and the * page's mod state indicates it has * already been modified. Make it * writable from the outset. */ nflags |= PVF_MOD; if (!(m->md.pvh_attrs & PVF_MOD)) vm_page_dirty(m); } if (m && opte) vm_page_aflag_set(m, PGA_REFERENCED); } else { /* * Need to do page referenced emulation. */ npte |= L2_TYPE_INV; } if (prot & VM_PROT_WRITE) { npte |= L2_S_PROT_W; if (m != NULL && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } if (m->md.pv_memattr != VM_MEMATTR_UNCACHEABLE) npte |= pte_l2_s_cache_mode; if (m && m == opg) { /* * We're changing the attrs of an existing mapping. */ oflags = pmap_modify_pv(m, pmap, va, PVF_WRITE | PVF_EXEC | PVF_WIRED | PVF_MOD | PVF_REF, nflags); /* * We may need to flush the cache if we're * doing rw-ro... */ if (pmap_is_current(pmap) && (oflags & PVF_NC) == 0 && (opte & L2_S_PROT_W) != 0 && (prot & VM_PROT_WRITE) == 0 && (opte & L2_TYPE_MASK) != L2_TYPE_INV) { cpu_dcache_wb_range(va, PAGE_SIZE); cpu_l2cache_wb_range(va, PAGE_SIZE); } } else { /* * New mapping, or changing the backing page * of an existing mapping. */ if (opg) { /* * Replacing an existing mapping with a new one. * It is part of our managed memory so we * must remove it from the PV list */ if ((pve = pmap_remove_pv(opg, pmap, va))) { /* note for patch: the oflags/invalidation was moved * because PG_FICTITIOUS pages could free the pve */ oflags = pve->pv_flags; /* * If the old mapping was valid (ref/mod * emulation creates 'invalid' mappings * initially) then make sure to frob * the cache. */ if ((oflags & PVF_NC) == 0 && l2pte_valid(opte)) { if (PV_BEEN_EXECD(oflags)) { pmap_idcache_wbinv_range(pmap, va, PAGE_SIZE); } else if (PV_BEEN_REFD(oflags)) { pmap_dcache_wb_range(pmap, va, PAGE_SIZE, TRUE, (oflags & PVF_WRITE) == 0); } } /* free/allocate a pv_entry for UNMANAGED pages if * this physical page is not/is already mapped. */ if (m && (m->oflags & VPO_UNMANAGED) && !m->md.pv_kva && TAILQ_EMPTY(&m->md.pv_list)) { pmap_free_pv_entry(pve); pve = NULL; } } else if (m && (!(m->oflags & VPO_UNMANAGED) || m->md.pv_kva || !TAILQ_EMPTY(&m->md.pv_list))) pve = pmap_get_pv_entry(); } else if (m && (!(m->oflags & VPO_UNMANAGED) || m->md.pv_kva || !TAILQ_EMPTY(&m->md.pv_list))) pve = pmap_get_pv_entry(); if (m) { if ((m->oflags & VPO_UNMANAGED)) { if (!TAILQ_EMPTY(&m->md.pv_list) || m->md.pv_kva) { KASSERT(pve != NULL, ("No pv")); nflags |= PVF_UNMAN; pmap_enter_pv(m, pve, pmap, va, nflags); } else m->md.pv_kva = va; } else { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); KASSERT(pve != NULL, ("No pv")); pmap_enter_pv(m, pve, pmap, va, nflags); } } } /* * Make sure userland mappings get the right permissions */ if (pmap != kernel_pmap && va != vector_page) { npte |= L2_S_PROT_U; } /* * Keep the stats up to date */ if (opte == 0) { l2b->l2b_occupancy++; pmap->pm_stats.resident_count++; } /* * If this is just a wiring change, the two PTEs will be * identical, so there's no need to update the page table. */ if (npte != opte) { boolean_t is_cached = pmap_is_current(pmap); *ptep = npte; if (is_cached) { /* * We only need to frob the cache/tlb if this pmap * is current */ PTE_SYNC(ptep); if (L1_IDX(va) != L1_IDX(vector_page) && l2pte_valid(npte)) { /* * This mapping is likely to be accessed as * soon as we return to userland. Fix up the * L1 entry to avoid taking another * page/domain fault. */ pd_entry_t *pl1pd, l1pd; pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)]; l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); } } } if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pmap, va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pmap, va); if (m) pmap_fix_cache(m, pmap, va); } return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_locked(pmap, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); pmap_enter_locked(pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * XXX Wired mappings of unmanaged pages cannot be counted by this pmap * implementation. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; pv_entry_t pv; vm_offset_t next_bucket; vm_page_t m; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (sva < eva) { next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pmap, sva); if (l2b == NULL) { sva = next_bucket; continue; } for (ptep = &l2b->l2b_kva[l2pte_index(sva)]; sva < next_bucket; sva += PAGE_SIZE, ptep++) { if ((pte = *ptep) == 0 || (m = PHYS_TO_VM_PAGE(l2pte_pa(pte))) == NULL || (m->oflags & VPO_UNMANAGED) != 0) continue; pv = pmap_find_pv(m, pmap, sva); if ((pv->pv_flags & PVF_WIRED) == 0) panic("pmap_unwire: pv %p isn't wired", pv); pv->pv_flags &= ~PVF_WIRED; pmap->pm_stats.wired_count--; } } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; PMAP_LOCK(pmap); pa = pmap_extract_locked(pmap, va); PMAP_UNLOCK(pmap); return (pa); } static vm_paddr_t pmap_extract_locked(pmap_t pmap, vm_offset_t va) { struct l2_dtable *l2; pd_entry_t l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; if (pmap != kernel_pmap) PMAP_ASSERT_LOCKED(pmap); l1idx = L1_IDX(va); l1pd = pmap->pm_l1->l1_kva[l1idx]; if (l1pte_section_p(l1pd)) { /* * These should only happen for the kernel pmap. */ KASSERT(pmap == kernel_pmap, ("unexpected section")); /* XXX: what to do about the bits > 32 ? */ if (l1pd & L1_S_SUPERSEC) pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET); else pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); } else { /* * Note that we can't rely on the validity of the L1 * descriptor as an indication that a mapping exists. * We have to look it up in the L2 dtable. */ l2 = pmap->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) return (0); pte = ptep[l2pte_index(va)]; if (pte == 0) return (0); if ((pte & L2_TYPE_MASK) == L2_TYPE_L) pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); else pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); } return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. * */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct l2_dtable *l2; pd_entry_t l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m; u_int l1idx; l1idx = L1_IDX(va); m = NULL; PMAP_LOCK(pmap); l1pd = pmap->pm_l1->l1_kva[l1idx]; if (l1pte_section_p(l1pd)) { /* * These should only happen for kernel_pmap */ KASSERT(pmap == kernel_pmap, ("huh")); /* XXX: what to do about the bits > 32 ? */ if (l1pd & L1_S_SUPERSEC) pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET); else pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); if (l1pd & L1_S_PROT_W || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } else { /* * Note that we can't rely on the validity of the L1 * descriptor as an indication that a mapping exists. * We have to look it up in the L2 dtable. */ l2 = pmap->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { PMAP_UNLOCK(pmap); return (NULL); } ptep = &ptep[l2pte_index(va)]; pte = *ptep; if (pte == 0) { PMAP_UNLOCK(pmap); return (NULL); } if (pte & L2_S_PROT_W || (prot & VM_PROT_WRITE) == 0) { if ((pte & L2_TYPE_MASK) == L2_TYPE_L) pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); else pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) { struct l2_dtable *l2; pd_entry_t l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; l1idx = L1_IDX(va); l1pd = kernel_pmap->pm_l1->l1_kva[l1idx]; if (l1pte_section_p(l1pd)) { if (l1pd & L1_S_SUPERSEC) pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET); else pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); pte = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE); } else { l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { pte = 0; pa = 0; goto out; } pte = ptep[l2pte_index(va)]; if (pte == 0) { pa = 0; goto out; } if ((pte & L2_TYPE_MASK) == L2_TYPE_L) pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); else pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); } out: if (pte2p != NULL) *pte2p = pte; return (pa); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap)); pmap_alloc_l1(pmap); bzero(pmap->pm_l2, sizeof(pmap->pm_l2)); CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_stats.resident_count = 1; if (vector_page < KERNBASE) { pmap_enter(pmap, vector_page, PHYS_TO_VM_PAGE(systempage.pv_pa), VM_PROT_READ, PMAP_ENTER_WIRED | VM_PROT_READ, 0); } return (1); } /*************************************************** * page management routines. ***************************************************/ static void pmap_free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t pmap_get_pv_entry(void) { pv_entry_t ret_value; pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(0); /* XXX ARM NUMA */ ret_value = uma_zalloc(pvzone, M_NOWAIT); return ret_value; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ #define PMAP_REMOVE_CLEAN_LIST_SIZE 3 void pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct l2_bucket *l2b; vm_offset_t next_bucket; pt_entry_t *ptep; u_int total; u_int mappings, is_exec, is_refd; int flushall = 0; /* * we lock in the pmap => pv_head direction */ rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); total = 0; while (sva < eva) { /* * Do one L2 bucket's worth at a time. */ next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; mappings = 0; while (sva < next_bucket) { struct vm_page *pg; pt_entry_t pte; vm_paddr_t pa; pte = *ptep; if (pte == 0) { /* * Nothing here, move along */ sva += PAGE_SIZE; ptep++; continue; } pm->pm_stats.resident_count--; pa = l2pte_pa(pte); is_exec = 0; is_refd = 1; /* * Update flags. In a number of circumstances, * we could cluster a lot of these and do a * number of sequential pages in one go. */ if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) { struct pv_entry *pve; pve = pmap_remove_pv(pg, pm, sva); if (pve) { is_exec = PV_BEEN_EXECD(pve->pv_flags); is_refd = PV_BEEN_REFD(pve->pv_flags); pmap_free_pv_entry(pve); } } if (l2pte_valid(pte) && pmap_is_current(pm)) { if (total < PMAP_REMOVE_CLEAN_LIST_SIZE) { total++; if (is_exec) { cpu_idcache_wbinv_range(sva, PAGE_SIZE); cpu_l2cache_wbinv_range(sva, PAGE_SIZE); cpu_tlb_flushID_SE(sva); } else if (is_refd) { cpu_dcache_wbinv_range(sva, PAGE_SIZE); cpu_l2cache_wbinv_range(sva, PAGE_SIZE); cpu_tlb_flushD_SE(sva); } } else if (total == PMAP_REMOVE_CLEAN_LIST_SIZE) { /* flushall will also only get set for * for a current pmap */ cpu_idcache_wbinv_all(); cpu_l2cache_wbinv_all(); flushall = 1; total++; } } *ptep = 0; PTE_SYNC(ptep); sva += PAGE_SIZE; ptep++; mappings++; } pmap_free_l2_bucket(pm, l2b, mappings); } rw_wunlock(&pvh_global_lock); if (flushall) cpu_tlb_flushID(); PMAP_UNLOCK(pm); } /* * pmap_zero_page() * * Zero a given physical page by mapping it at a page hook point. * In doing the zero page op, the page we zero is mapped cachable, as with * StrongARM accesses to non-cached pages are non-burst making writing * _any_ bulk data very slow. */ static void pmap_zero_page_generic(vm_paddr_t phys, int off, int size) { if (_arm_bzero && size >= _min_bzero_size && _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0) return; mtx_lock(&cmtx); /* * Hook in the page, zero it, invalidate the TLB as needed. * * Note the temporary zero-page mapping must be a non-cached page in * order to work without corruption when write-allocate is enabled. */ *cdst_pte = L2_S_PROTO | phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE); PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); if (off || size != PAGE_SIZE) bzero((void *)(cdstp + off), size); else bzero_page(cdstp); mtx_unlock(&cmtx); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pmap_zero_page_generic(VM_PAGE_TO_PHYS(m), 0, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pmap_zero_page_generic(VM_PAGE_TO_PHYS(m), off, size); } #if 0 /* * pmap_clean_page() * * This is a local function used to work out the best strategy to clean * a single page referenced by its entry in the PV table. It should be used by * pmap_copy_page, pmap_zero page and maybe some others later on. * * Its policy is effectively: * o If there are no mappings, we don't bother doing anything with the cache. * o If there is one mapping, we clean just that page. * o If there are multiple mappings, we clean the entire cache. * * So that some functions can be further optimised, it returns 0 if it didn't * clean the entire cache, or 1 if it did. * * XXX One bug in this routine is that if the pv_entry has a single page * mapped at 0x00000000 a whole cache clean will be performed rather than * just the 1 page. Since this should not occur in everyday use and if it does * it will just result in not the most efficient clean for the page. * * We don't yet use this function but may want to. */ static int pmap_clean_page(struct pv_entry *pv, boolean_t is_src) { pmap_t pm, pm_to_clean = NULL; struct pv_entry *npv; u_int cache_needs_cleaning = 0; u_int flags = 0; vm_offset_t page_to_clean = 0; if (pv == NULL) { /* nothing mapped in so nothing to flush */ return (0); } /* * Since we flush the cache each time we change to a different * user vmspace, we only need to flush the page if it is in the * current pmap. */ if (curthread) pm = vmspace_pmap(curproc->p_vmspace); else pm = kernel_pmap; for (npv = pv; npv; npv = TAILQ_NEXT(npv, pv_list)) { if (npv->pv_pmap == kernel_pmap || npv->pv_pmap == pm) { flags |= npv->pv_flags; /* * The page is mapped non-cacheable in * this map. No need to flush the cache. */ if (npv->pv_flags & PVF_NC) { #ifdef DIAGNOSTIC if (cache_needs_cleaning) panic("pmap_clean_page: " "cache inconsistency"); #endif break; } else if (is_src && (npv->pv_flags & PVF_WRITE) == 0) continue; if (cache_needs_cleaning) { page_to_clean = 0; break; } else { page_to_clean = npv->pv_va; pm_to_clean = npv->pv_pmap; } cache_needs_cleaning = 1; } } if (page_to_clean) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_range(pm_to_clean, page_to_clean, PAGE_SIZE); else pmap_dcache_wb_range(pm_to_clean, page_to_clean, PAGE_SIZE, !is_src, (flags & PVF_WRITE) == 0); } else if (cache_needs_cleaning) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_all(pm); else pmap_dcache_wbinv_all(pm); return (1); } return (0); } #endif /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ /* * pmap_copy_page() * * Copy one physical page into another, by mapping the pages into * hook points. The same comment regarding cachability as in * pmap_zero_page also applies here. */ static void pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst) { #if 0 struct vm_page *src_pg = PHYS_TO_VM_PAGE(src); #endif /* * Clean the source page. Hold the source page's lock for * the duration of the copy so that no other mappings can * be created while we have a potentially aliased mapping. */ #if 0 /* * XXX: Not needed while we call cpu_dcache_wbinv_all() in * pmap_copy_page(). */ (void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE); #endif /* * Map the pages into the page hook points, copy them, and purge * the cache for the appropriate page. Invalidate the TLB * as required. */ mtx_lock(&cmtx); *csrc_pte = L2_S_PROTO | src | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode; PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | dst | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy_page(csrcp, cdstp); mtx_unlock(&cmtx); cpu_dcache_inv_range(csrcp, PAGE_SIZE); cpu_dcache_wbinv_range(cdstp, PAGE_SIZE); cpu_l2cache_inv_range(csrcp, PAGE_SIZE); cpu_l2cache_wbinv_range(cdstp, PAGE_SIZE); } void pmap_copy_page_offs_generic(vm_paddr_t a_phys, vm_offset_t a_offs, vm_paddr_t b_phys, vm_offset_t b_offs, int cnt) { mtx_lock(&cmtx); *csrc_pte = L2_S_PROTO | a_phys | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode; PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | b_phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy((char *)csrcp + a_offs, (char *)cdstp + b_offs, cnt); mtx_unlock(&cmtx); cpu_dcache_inv_range(csrcp + a_offs, cnt); cpu_dcache_wbinv_range(cdstp + b_offs, cnt); cpu_l2cache_inv_range(csrcp + a_offs, cnt); cpu_l2cache_wbinv_range(cdstp + b_offs, cnt); } void pmap_copy_page(vm_page_t src, vm_page_t dst) { cpu_dcache_wbinv_all(); cpu_l2cache_wbinv_all(); if (_arm_memcpy && PAGE_SIZE >= _min_memcpy_size && _arm_memcpy((void *)VM_PAGE_TO_PHYS(dst), (void *)VM_PAGE_TO_PHYS(src), PAGE_SIZE, IS_PHYSICAL) == 0) return; pmap_copy_page_generic(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); } /* * We have code to do unmapped I/O. However, it isn't quite right and * causes un-page-aligned I/O to devices to fail (most notably newfs * or fsck). We give up a little performance to not allow unmapped I/O * to gain stability. */ int unmapped_buf_allowed = 0; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; vm_offset_t a_pg_offset, b_pg_offset; int cnt; cpu_dcache_wbinv_all(); cpu_l2cache_wbinv_all(); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); pmap_copy_page_offs_generic(VM_PAGE_TO_PHYS(a_pg), a_pg_offset, VM_PAGE_TO_PHYS(b_pg), b_pg_offset, cnt); xfersize -= cnt; a_offset += cnt; b_offset += cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { /* * Don't bother with a PCPU pageframe, since we don't support * SMP for anything pre-armv7. Use pmap_kenter() to ensure * caching is handled correctly for multiple mappings of the * same physical page. */ mtx_assert(&qmap_mtx, MA_NOTOWNED); mtx_lock(&qmap_mtx); pmap_kenter(qmap_addr, VM_PAGE_TO_PHYS(m)); return (qmap_addr); } void pmap_quick_remove_page(vm_offset_t addr) { KASSERT(addr == qmap_addr, ("pmap_quick_remove_page: invalid address")); mtx_assert(&qmap_mtx, MA_OWNED); pmap_kremove(addr); mtx_unlock(&qmap_mtx); } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { pv_entry_t pv; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) if ((pv->pv_flags & PVF_WIRED) != 0) count++; rw_wunlock(&pvh_global_lock); return (count); } /* * This function is advisory. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); return (pmap_clearbit(m, PVF_REF)); } boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); if (m->md.pvh_attrs & PVF_MOD) return (TRUE); return(FALSE); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no mappings can be modified. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; if (m->md.pvh_attrs & PVF_MOD) pmap_clearbit(m, PVF_MOD); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return ((m->md.pvh_attrs & PVF_REF) != 0); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (vm_page_xbusied(m) || (vm_page_aflags(m) & PGA_WRITEABLE) != 0) + if (vm_page_xbusied(m) || (m->aflags & PGA_WRITEABLE) != 0) pmap_clearbit(m, PVF_WRITE); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m; int val; boolean_t managed; PMAP_LOCK(pmap); retry: l2b = pmap_get_l2_bucket(pmap, addr); if (l2b == NULL) { val = 0; goto out; } ptep = &l2b->l2b_kva[l2pte_index(addr)]; pte = *ptep; if (!l2pte_valid(pte)) { val = 0; goto out; } val = MINCORE_INCORE; if (pte & L2_S_PROT_W) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; managed = false; pa = l2pte_pa(pte); m = PHYS_TO_VM_PAGE(pa); if (m != NULL && !(m->oflags & VPO_UNMANAGED)) managed = true; if (managed) { /* * The ARM pmap tries to maintain a per-mapping * reference bit. The trouble is that it's kept in * the PV entry, not the PTE, so it's costly to access * here. You would need to acquire the pvh global * lock, call pmap_find_pv(), and introduce a custom * version of vm_page_pa_tryrelock() that releases and * reacquires the pvh global lock. In the end, I * doubt it's worthwhile. This may falsely report * the given address as referenced. */ if ((m->md.pvh_attrs & PVF_REF) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else out: PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { } #define BOOTSTRAP_DEBUG /* * pmap_map_section: * * Create a single section mapping. */ void pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pd_entry_t fl; KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2")); switch (cache) { case PTE_NOCACHE: default: fl = 0; break; case PTE_CACHE: fl = pte_l1_s_cache_mode; break; case PTE_PAGETABLE: fl = pte_l1_s_cache_mode_pt; break; } pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | fl | L1_S_DOM(PMAP_DOMAIN_KERNEL); PTE_SYNC(&pde[va >> L1_S_SHIFT]); } /* * pmap_link_l2pt: * * Link the L2 page table specified by l2pv.pv_pa into the L1 * page table at the slot for "va". */ void pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv) { pd_entry_t *pde = (pd_entry_t *) l1pt, proto; u_int slot = va >> L1_S_SHIFT; proto = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO; #ifdef VERBOSE_INIT_ARM printf("pmap_link_l2pt: pa=0x%x va=0x%x\n", l2pv->pv_pa, l2pv->pv_va); #endif pde[slot + 0] = proto | (l2pv->pv_pa + 0x000); PTE_SYNC(&pde[slot]); SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list); } /* * pmap_map_entry * * Create a single page mapping. */ void pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t fl; pt_entry_t *pte; KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin")); switch (cache) { case PTE_NOCACHE: default: fl = 0; break; case PTE_CACHE: fl = pte_l2_s_cache_mode; break; case PTE_PAGETABLE: fl = pte_l2_s_cache_mode_pt; break; } if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C) panic("pmap_map_entry: no L2 table for VA 0x%08x", va); pte = (pt_entry_t *) kernel_pt_lookup(pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va); pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | fl; PTE_SYNC(&pte[l2pte_index(va)]); } /* * pmap_map_chunk: * * Map a chunk of memory using the most efficient mappings * possible (section. large page, small page) into the * provided L1 and L2 tables at the specified virtual address. */ vm_size_t pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, vm_size_t size, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t *pte, f1, f2s, f2l; vm_size_t resid; int i; resid = roundup2(size, PAGE_SIZE); if (l1pt == 0) panic("pmap_map_chunk: no L1 table provided"); #ifdef VERBOSE_INIT_ARM printf("pmap_map_chunk: pa=0x%x va=0x%x size=0x%x resid=0x%x " "prot=0x%x cache=%d\n", pa, va, size, resid, prot, cache); #endif switch (cache) { case PTE_NOCACHE: default: f1 = 0; f2l = 0; f2s = 0; break; case PTE_CACHE: f1 = pte_l1_s_cache_mode; f2l = pte_l2_l_cache_mode; f2s = pte_l2_s_cache_mode; break; case PTE_PAGETABLE: f1 = pte_l1_s_cache_mode_pt; f2l = pte_l2_l_cache_mode_pt; f2s = pte_l2_s_cache_mode_pt; break; } size = resid; while (resid > 0) { /* See if we can use a section mapping. */ if (L1_S_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("S"); #endif pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | f1 | L1_S_DOM(PMAP_DOMAIN_KERNEL); PTE_SYNC(&pde[va >> L1_S_SHIFT]); va += L1_S_SIZE; pa += L1_S_SIZE; resid -= L1_S_SIZE; continue; } /* * Ok, we're going to use an L2 table. Make sure * one is actually in the corresponding L1 slot * for the current VA. */ if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C) panic("pmap_map_chunk: no L2 table for VA 0x%08x", va); pte = (pt_entry_t *) kernel_pt_lookup( pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("pmap_map_chunk: can't find L2 table for VA" "0x%08x", va); /* See if we can use a L2 large page mapping. */ if (L2_L_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("L"); #endif for (i = 0; i < 16; i++) { pte[l2pte_index(va) + i] = L2_L_PROTO | pa | L2_L_PROT(PTE_KERNEL, prot) | f2l; PTE_SYNC(&pte[l2pte_index(va) + i]); } va += L2_L_SIZE; pa += L2_L_SIZE; resid -= L2_L_SIZE; continue; } /* Use a small page mapping. */ #ifdef VERBOSE_INIT_ARM printf("P"); #endif pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | f2s; PTE_SYNC(&pte[l2pte_index(va)]); va += PAGE_SIZE; pa += PAGE_SIZE; resid -= PAGE_SIZE; } #ifdef VERBOSE_INIT_ARM printf("\n"); #endif return (size); } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { /* * Remember the memattr in a field that gets used to set the appropriate * bits in the PTEs as mappings are established. */ m->md.pv_memattr = ma; /* * It appears that this function can only be called before any mappings * for the page are established on ARM. If this ever changes, this code * will need to walk the pv_list and make each of the existing mappings * uncacheable, being careful to sync caches and PTEs (and maybe * invalidate TLB?) for any current mapping it modifies. */ if (m->md.pv_kva != 0 || TAILQ_FIRST(&m->md.pv_list) != NULL) panic("Can't change memattr on page with existing mappings"); } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode == VM_MEMATTR_DEFAULT || mode == VM_MEMATTR_UNCACHEABLE); } Index: head/sys/arm/arm/pmap-v6.c =================================================================== --- head/sys/arm/arm/pmap-v6.c (revision 352406) +++ head/sys/arm/arm/pmap-v6.c (revision 352407) @@ -1,6976 +1,6976 @@ /*- * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause-FreeBSD * * Copyright (c) 1991 Regents of the University of California. * Copyright (c) 1994 John S. Dyson * Copyright (c) 1994 David Greenman * Copyright (c) 2005-2010 Alan L. Cox * Copyright (c) 2014-2016 Svatopluk Kraus * Copyright (c) 2014-2016 Michal Meloun * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include "opt_pmap.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #ifndef DIAGNOSTIC #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PMAP_DEBUG static void pmap_zero_page_check(vm_page_t m); void pmap_debug(int level); int pmap_pid_dump(int pid); #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 1; #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #endif /* PMAP_DEBUG */ /* * Level 2 page tables map definion ('max' is excluded). */ #define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) #define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define UPT2V_MAX_ADDRESS \ ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) /* * Promotion to a 1MB (PTE1) page mapping requires that the corresponding * 4KB (PTE2) page mappings have identical settings for the following fields: */ #define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ PTE2_ATTR_MASK) #define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ PTE1_ATTR_MASK) #define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ (((l2_attr) & L2_C) ? L1_S_C : 0) | \ (((l2_attr) & L2_B) ? L1_S_B : 0) | \ (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ (((l2_attr) & PTE2_W) ? PTE1_W : 0)) #define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ (((l1_attr) & L1_S_C) ? L2_C : 0) | \ (((l1_attr) & L1_S_B) ? L2_B : 0) | \ (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ (((l1_attr) & PTE1_W) ? PTE2_W : 0)) /* * PTE2 descriptors creation macros. */ #define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) #define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) #define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) #define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * The boot_pt1 is used temporary in very early boot stage as L1 page table. * We can init many things with no memory allocation thanks to its static * allocation and this brings two main advantages: * (1) other cores can be started very simply, * (2) various boot loaders can be supported as its arguments can be processed * in virtual address space and can be moved to safe location before * first allocation happened. * Only disadvantage is that boot_pt1 is used only in very early boot stage. * However, the table is uninitialized and so lays in bss. Therefore kernel * image size is not influenced. * * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and * CPU suspend/resume game. */ extern pt1_entry_t boot_pt1[]; vm_paddr_t base_pt1; pt1_entry_t *kern_pt1; pt2_entry_t *kern_pt2tab; pt2_entry_t *PT2MAP; static uint32_t ttb_flags; static vm_memattr_t pt_memattr; ttb_entry_t pmap_kern_ttb; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static vm_offset_t kernel_vm_end_new; vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; vm_offset_t vm_max_kernel_address; vm_paddr_t kernel_l1pa; static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ vm_paddr_t first_managed_pa; #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) /* * All those kernel PT submaps that BSD is so fond of */ caddr_t _tmppt = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt2_entry_t *PMAP1 = NULL, *PMAP2; static pt2_entry_t *PADDR1 = NULL, *PADDR2; #ifdef DDB static pt2_entry_t *PMAP3; static pt2_entry_t *PADDR3; static int PMAP3cpu __unused; /* for SMP only */ #endif #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte2_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte2_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte2_quick didn't change PMAP1"); static struct mtx PMAP2mutex; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static __inline void pt2_wirecount_init(vm_page_t m); static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va); static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, vm_page_t m); void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); /* * Function to set the debug level of the pmap code. */ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ /* * This table must corespond with memory attribute configuration in vm.h. * First entry is used for normal system mapping. * * Device memory is always marked as shared. * Normal memory is shared only in SMP . * Not outer shareable bits are not used yet. * Class 6 cannot be used on ARM11. */ #define TEXDEF_TYPE_SHIFT 0 #define TEXDEF_TYPE_MASK 0x3 #define TEXDEF_INNER_SHIFT 2 #define TEXDEF_INNER_MASK 0x3 #define TEXDEF_OUTER_SHIFT 4 #define TEXDEF_OUTER_MASK 0x3 #define TEXDEF_NOS_SHIFT 6 #define TEXDEF_NOS_MASK 0x1 #define TEX(t, i, o, s) \ ((t) << TEXDEF_TYPE_SHIFT) | \ ((i) << TEXDEF_INNER_SHIFT) | \ ((o) << TEXDEF_OUTER_SHIFT | \ ((s) << TEXDEF_NOS_SHIFT)) static uint32_t tex_class[8] = { /* type inner cache outer cache */ TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ }; #undef TEX static uint32_t pte2_attr_tab[8] = { PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 0, /* 5 - NOT USED YET */ 0, /* 6 - NOT USED YET */ 0 /* 7 - NOT USED YET */ }; CTASSERT(VM_MEMATTR_WB_WA == 0); CTASSERT(VM_MEMATTR_NOCACHE == 1); CTASSERT(VM_MEMATTR_DEVICE == 2); CTASSERT(VM_MEMATTR_SO == 3); CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); #define VM_MEMATTR_END (VM_MEMATTR_WRITE_THROUGH + 1) boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < VM_MEMATTR_END); } static inline uint32_t vm_memattr_to_pte2(vm_memattr_t ma) { KASSERT((u_int)ma < VM_MEMATTR_END, ("%s: bad vm_memattr_t %d", __func__, ma)); return (pte2_attr_tab[(u_int)ma]); } static inline uint32_t vm_page_pte2_attr(vm_page_t m) { return (vm_memattr_to_pte2(m->md.pat_mode)); } /* * Convert TEX definition entry to TTB flags. */ static uint32_t encode_ttb_flags(int idx) { uint32_t inner, outer, nos, reg; inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; reg = nos << 5; reg |= outer << 3; if (cpuinfo.coherent_walk) reg |= (inner & 0x1) << 6; reg |= (inner & 0x2) >> 1; #ifdef SMP ARM_SMP_UP( reg |= 1 << 1, ); #endif return reg; } /* * Set TEX remapping registers in current CPU. */ void pmap_set_tex(void) { uint32_t prrr, nmrr; uint32_t type, inner, outer, nos; int i; #ifdef PMAP_PTE_NOCACHE /* XXX fixme */ if (cpuinfo.coherent_walk) { pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); } else { pt_memattr = VM_MEMATTR_NOCACHE; ttb_flags = encode_ttb_flags(1); } #else pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); #endif prrr = 0; nmrr = 0; /* Build remapping register from TEX classes. */ for (i = 0; i < 8; i++) { type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & TEXDEF_TYPE_MASK; inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; prrr |= type << (i * 2); prrr |= nos << (i + 24); nmrr |= inner << (i * 2); nmrr |= outer << (i * 2 + 16); } /* Add shareable bits for device memory. */ prrr |= PRRR_DS0 | PRRR_DS1; /* Add shareable bits for normal memory in SMP case. */ #ifdef SMP ARM_SMP_UP( prrr |= PRRR_NS1, ); #endif cp15_prrr_set(prrr); cp15_nmrr_set(nmrr); /* Caches are disabled, so full TLB flush should be enough. */ tlb_flush_all_local(); } /* * Remap one vm_meattr class to another one. This can be useful as * workaround for SOC errata, e.g. if devices must be accessed using * SO memory class. * * !!! Please note that this function is absolutely last resort thing. * It should not be used under normal circumstances. !!! * * Usage rules: * - it shall be called after pmap_bootstrap_prepare() and before * cpu_mp_start() (thus only on boot CPU). In practice, it's expected * to be called from platform_attach() or platform_late_init(). * * - if remapping doesn't change caching mode, or until uncached class * is remapped to any kind of cached one, then no other restriction exists. * * - if pmap_remap_vm_attr() changes caching mode, but both (original and * remapped) remain cached, then caller is resposible for calling * of dcache_wbinv_poc_all(). * * - remapping of any kind of cached class to uncached is not permitted. */ void pmap_remap_vm_attr(vm_memattr_t old_attr, vm_memattr_t new_attr) { int old_idx, new_idx; /* Map VM memattrs to indexes to tex_class table. */ old_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)old_attr]); new_idx = PTE2_ATTR2IDX(pte2_attr_tab[(int)new_attr]); /* Replace TEX attribute and apply it. */ tex_class[old_idx] = tex_class[new_idx]; pmap_set_tex(); } /* * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words, * KERNBASE is mapped by first L2 page table in L2 page table page. It * meets same constrain due to PT2MAP being placed just under KERNBASE. */ CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); /* * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. * For now, anyhow, the following check must be fulfilled. */ CTASSERT(PAGE_SIZE == PTE2_SIZE); /* * We don't want to mess up MI code with all MMU and PMAP definitions, * so some things, which depend on other ones, are defined independently. * Now, it is time to check that we don't screw up something. */ CTASSERT(PDRSHIFT == PTE1_SHIFT); /* * Check L1 and L2 page table entries definitions consistency. */ CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); /* * Check L2 page tables page consistency. */ CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); /* * Check PT2TAB consistency. * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. * This should be done without remainder. */ CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG)); /* * A PT2MAP magic. * * All level 2 page tables (PT2s) are mapped continuously and accordingly * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page * must be used together, but not necessary at once. The first PT2 in a page * must map things on correctly aligned address and the others must follow * in right order. */ #define NB_IN_PT2TAB (PT2TAB_ENTRIES * sizeof(pt2_entry_t)) #define NPT2_IN_PT2TAB (NB_IN_PT2TAB / NB_IN_PT2) #define NPG_IN_PT2TAB (NB_IN_PT2TAB / PAGE_SIZE) /* * Check PT2TAB consistency. * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2. * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. * The both should be done without remainder. */ CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); /* * The implementation was made general, however, with the assumption * bellow in mind. In case of another value of NPG_IN_PT2TAB, * the code should be once more rechecked. */ CTASSERT(NPG_IN_PT2TAB == 1); /* * Get offset of PT2 in a page * associated with given PT1 index. */ static __inline u_int page_pt2off(u_int pt1_idx) { return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); } /* * Get physical address of PT2 * associated with given PT2s page and PT1 index. */ static __inline vm_paddr_t page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) { return (pgpa + page_pt2off(pt1_idx)); } /* * Get first entry of PT2 * associated with given PT2s page and PT1 index. */ static __inline pt2_entry_t * page_pt2(vm_offset_t pgva, u_int pt1_idx) { return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); } /* * Get virtual address of PT2s page (mapped in PT2MAP) * which holds PT2 which holds entry which maps given virtual address. */ static __inline vm_offset_t pt2map_pt2pg(vm_offset_t va) { va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); return ((vm_offset_t)pt2map_entry(va)); } /***************************************************************************** * * THREE pmap initialization milestones exist: * * locore.S * -> fundamental init (including MMU) in ASM * * initarm() * -> fundamental init continues in C * -> first available physical address is known * * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) * -> basic (safe) interface for physical address allocation is made * -> basic (safe) interface for virtual mapping is made * -> limited not SMP coherent work is possible * * -> more fundamental init continues in C * -> locks and some more things are available * -> all fundamental allocations and mappings are done * * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) * -> phys_avail[] and virtual_avail is set * -> control is passed to vm subsystem * -> physical and virtual address allocation are off limit * -> low level mapping functions, some SMP coherent, * are available, which cannot be used before vm subsystem * is being inited * * mi_startup() * -> vm subsystem is being inited * * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) * -> pmap is fully inited * *****************************************************************************/ /***************************************************************************** * * PMAP first stage initialization and utility functions * for pre-bootstrap epoch. * * After pmap_bootstrap_prepare() is called, the following functions * can be used: * * (1) strictly only for this stage functions for physical page allocations, * virtual space allocations, and mappings: * * vm_paddr_t pmap_preboot_get_pages(u_int num); * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); * vm_offset_t pmap_preboot_reserve_pages(u_int num); * vm_offset_t pmap_preboot_get_vpages(u_int num); * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, * vm_prot_t prot, vm_memattr_t attr); * * (2) for all stages: * * vm_paddr_t pmap_kextract(vm_offset_t va); * * NOTE: This is not SMP coherent stage. * *****************************************************************************/ #define KERNEL_P2V(pa) \ ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) #define KERNEL_V2P(va) \ ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) static vm_paddr_t last_paddr; /* * Pre-bootstrap epoch page allocator. */ vm_paddr_t pmap_preboot_get_pages(u_int num) { vm_paddr_t ret; ret = last_paddr; last_paddr += num * PAGE_SIZE; return (ret); } /* * The fundamental initialization of PMAP stuff. * * Some things already happened in locore.S and some things could happen * before pmap_bootstrap_prepare() is called, so let's recall what is done: * 1. Caches are disabled. * 2. We are running on virtual addresses already with 'boot_pt1' * as L1 page table. * 3. So far, all virtual addresses can be converted to physical ones and * vice versa by the following macros: * KERNEL_P2V(pa) .... physical to virtual ones, * KERNEL_V2P(va) .... virtual to physical ones. * * What is done herein: * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. * 2. PT2MAP magic is brought to live. * 3. Basic preboot functions for page allocations and mappings can be used. * 4. Everything is prepared for L1 cache enabling. * * Variations: * 1. To use second TTB register, so kernel and users page tables will be * separated. This way process forking - pmap_pinit() - could be faster, * it saves physical pages and KVA per a process, and it's simple change. * However, it will lead, due to hardware matter, to the following: * (a) 2G space for kernel and 2G space for users. * (b) 1G space for kernel in low addresses and 3G for users above it. * A question is: Is the case (b) really an option? Note that case (b) * does save neither physical memory and KVA. */ void pmap_bootstrap_prepare(vm_paddr_t last) { vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; vm_offset_t pt2pg_va; pt1_entry_t *pte1p; pt2_entry_t *pte2p; u_int i; uint32_t l1_attr; /* * Now, we are going to make real kernel mapping. Note that we are * already running on some mapping made in locore.S and we expect * that it's large enough to ensure nofault access to physical memory * allocated herein before switch. * * As kernel image and everything needed before are and will be mapped * by section mappings, we align last physical address to PTE1_SIZE. */ last_paddr = pte1_roundup(last); /* * Allocate and zero page(s) for kernel L1 page table. * * Note that it's first allocation on space which was PTE1_SIZE * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. */ base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); bzero((void*)kern_pt1, NB_IN_PT1); pte1_sync_range(kern_pt1, NB_IN_PT1); /* Allocate and zero page(s) for kernel PT2TAB. */ pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); bzero(kern_pt2tab, NB_IN_PT2TAB); pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); /* Allocate and zero page(s) for kernel L2 page tables. */ pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); pt2pg_va = KERNEL_P2V(pt2pg_pa); size = NKPT2PG * PAGE_SIZE; bzero((void*)pt2pg_va, size); pte2_sync_range((pt2_entry_t *)pt2pg_va, size); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated pages for kernel L2 page tables so that vm_page * structures representing these pages will be created. The vm_page * structures are required for promotion of the corresponding kernel * virtual addresses to section mappings. */ vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); /* * Insert allocated L2 page table pages to PT2TAB and make * link to all PT2s in L1 page table. See how kernel_vm_end * is initialized. * * We play simple and safe. So every KVA will have underlaying * L2 page table, even kernel image mapped by sections. */ pte2p = kern_pt2tab_entry(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) pt2tab_store(pte2p++, PTE2_KPT(pa)); pte1p = kern_pte1(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) pte1_store(pte1p++, PTE1_LINK(pa)); /* Make section mappings for kernel. */ l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); pte1p = kern_pte1(KERNBASE); for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); /* * Get free and aligned space for PT2MAP and make L1 page table links * to L2 page tables held in PT2TAB. * * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus * each entry in PT2TAB maps all PT2s in a page. This implies that * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. */ PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); pte1p = kern_pte1((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. * Each pmap will hold own PT2TAB, so the mapping should be not global. */ pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* * Choose correct L2 page table and make mappings for allocations * made herein which replaces temporary locore.S mappings after a while. * Note that PT2MAP cannot be used until we switch to kern_pt1. * * Note, that these allocations started aligned on 1M section and * kernel PT1 was allocated first. Making of mappings must follow * order of physical allocations as we've used KERNEL_P2V() macro * for virtual addresses resolution. */ pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); /* Make mapping for kernel L1 page table. */ for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Make mapping for kernel PT2TAB. */ for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ pmap_kern_ttb = base_pt1 | ttb_flags; cpuinfo_reinit_mmu(pmap_kern_ttb); /* * Initialize the first available KVA. As kernel image is mapped by * sections, we are leaving some gap behind. */ virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; } /* * Setup L2 page table page for given KVA. * Used in pre-bootstrap epoch. * * Note that we have allocated NKPT2PG pages for L2 page tables in advance * and used them for mapping KVA starting from KERNBASE. However, this is not * enough. Vectors and devices need L2 page tables too. Note that they are * even above VM_MAX_KERNEL_ADDRESS. */ static __inline vm_paddr_t pmap_preboot_pt2pg_setup(vm_offset_t va) { pt2_entry_t *pte2p, pte2; vm_paddr_t pt2pg_pa; /* Get associated entry in PT2TAB. */ pte2p = kern_pt2tab_entry(va); /* Just return, if PT2s page exists already. */ pte2 = pt2tab_load(pte2p); if (pte2_is_valid(pte2)) return (pte2_pa(pte2)); KASSERT(va >= VM_MAX_KERNEL_ADDRESS, ("%s: NKPT2PG too small", __func__)); /* * Allocate page for PT2s and insert it to PT2TAB. * In other words, map it into PT2MAP space. */ pt2pg_pa = pmap_preboot_get_pages(1); pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); /* Zero all PT2s in allocated page. */ bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); return (pt2pg_pa); } /* * Setup L2 page table for given KVA. * Used in pre-bootstrap epoch. */ static void pmap_preboot_pt2_setup(vm_offset_t va) { pt1_entry_t *pte1p; vm_paddr_t pt2pg_pa, pt2_pa; /* Setup PT2's page. */ pt2pg_pa = pmap_preboot_pt2pg_setup(va); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); /* Insert PT2 to PT1. */ pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } /* * Get L2 page entry associated with given KVA. * Used in pre-bootstrap epoch. */ static __inline pt2_entry_t* pmap_preboot_vtopte2(vm_offset_t va) { pt1_entry_t *pte1p; /* Setup PT2 if needed. */ pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ pmap_preboot_pt2_setup(va); return (pt2map_entry(va)); } /* * Pre-bootstrap epoch page(s) mapping(s). */ void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) { u_int i; pt2_entry_t *pte2p; /* Map all the pages. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KRW(pa)); va += PAGE_SIZE; pa += PAGE_SIZE; } } /* * Pre-bootstrap epoch virtual space alocator. */ vm_offset_t pmap_preboot_reserve_pages(u_int num) { u_int i; vm_offset_t start, va; pt2_entry_t *pte2p; /* Allocate virtual space. */ start = va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Zero the mapping. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, 0); va += PAGE_SIZE; } return (start); } /* * Pre-bootstrap epoch page(s) allocation and mapping(s). */ vm_offset_t pmap_preboot_get_vpages(u_int num) { vm_paddr_t pa; vm_offset_t va; /* Allocate physical page(s). */ pa = pmap_preboot_get_pages(num); /* Allocate virtual space. */ va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Map and zero all. */ pmap_preboot_map_pages(pa, va, num); bzero((void *)va, num * PAGE_SIZE); return (va); } /* * Pre-bootstrap epoch page mapping(s) with attributes. */ void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, vm_prot_t prot, vm_memattr_t attr) { u_int num; u_int l1_attr, l1_prot, l2_prot, l2_attr; pt1_entry_t *pte1p; pt2_entry_t *pte2p; l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l2_attr = vm_memattr_to_pte2(attr); l1_prot = ATTR_TO_L1(l2_prot); l1_attr = ATTR_TO_L1(l2_attr); /* Map all the pages. */ num = round_page(size); while (num > 0) { if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); va += PTE1_SIZE; pa += PTE1_SIZE; num -= PTE1_SIZE; } else { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); va += PAGE_SIZE; pa += PAGE_SIZE; num -= PAGE_SIZE; } } } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". */ vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); } else if (pte1_is_link(pte1)) { /* * We should beware of concurrent promotion that changes * pte1 at this point. However, it's not a problem as PT2 * page is preserved by promotion in PT2TAB. So even if * it happens, using of PT2MAP is still safe. * * QQQ: However, concurrent removing is a problem which * ends in abort on PT2MAP space. Locking must be used * to deal with this. */ pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2) | (va & PTE2_OFFSET); } else { panic("%s: va %#x pte1 %#x", __func__, va, pte1); } return (pa); } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". Also * return L2 page table entry which maps the address. * * This is only intended to be used for panic dumps. */ vm_paddr_t pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2); } else { pte2 = 0; pa = 0; } if (pte2p != NULL) *pte2p = pte2; return (pa); } /***************************************************************************** * * PMAP second stage initialization and utility functions * for bootstrap epoch. * * After pmap_bootstrap() is called, the following functions for * mappings can be used: * * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); * void pmap_kremove(vm_offset_t va); * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, * int prot); * * NOTE: This is not SMP coherent stage. And physical page allocation is not * allowed during this stage. * *****************************************************************************/ /* * Initialize kernel PMAP locks and lists, kernel_pmap itself, and * reserve various virtual spaces for temporary mappings. */ void pmap_bootstrap(vm_offset_t firstaddr) { pt2_entry_t *unused __unused; struct pcpu *pc; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ kernel_pmap->pm_pt1 = kern_pt1; kernel_pmap->pm_pt2tab = kern_pt2tab; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) do { \ v = (c)pmap_preboot_reserve_pages(n); \ p = pt2map_entry((vm_offset_t)v); \ } while (0) /* * Local CMAP1/CMAP2 are used for zeroing and copying pages. * Local CMAP2 is also used for data cache cleaning. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap1_pte2p, pc->pc_cmap1_addr, 1); SYSMAP(caddr_t, pc->pc_cmap2_pte2p, pc->pc_cmap2_addr, 1); SYSMAP(vm_offset_t, pc->pc_qmap_pte2p, pc->pc_qmap_addr, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); /* * _tmppt is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, _tmppt, 1); /* * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), * respectively. PADDR3 is used by pmap_pte2_ddb(). */ SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); #ifdef DDB SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); #endif mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); /* * Note that in very short time in initarm(), we are going to * initialize phys_avail[] array and no further page allocation * can happen after that until vm subsystem will be initialized. */ kernel_vm_end_new = kernel_vm_end; virtual_end = vm_max_kernel_address; } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap1_addr != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap1_pte2p = pt2map_entry(pages); pc->pc_cmap2_pte2p = pt2map_entry(pages + PAGE_SIZE); pc->pc_qmap_pte2p = pt2map_entry(pages + (PAGE_SIZE * 2)); pc->pc_cmap1_addr = (caddr_t)pages; pc->pc_cmap2_addr = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * The function can already be use in second initialization stage. * As such, the function DOES NOT call pmap_growkernel() where PT2 * allocation can happen. So if used, be sure that PT2 for given * virtual address is allocated already! * * Add a wired page to the kva. * Note: not SMP coherent. */ static __inline void pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, uint32_t attr) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ /* * This is a very low level function, so PT2 and particularly * PT2PG associated with given virtual address must be already * allocated. It's a pain mainly during pmap initialization * stage. However, called after pmap initialization with * virtual address not under kernel_vm_end will lead to * the same misery. */ if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) panic("%s: kernel PT2 not allocated!", __func__); } pte2p = pt2map_entry(va); pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); } PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (pte1_is_section(pte1_load(pte1p))) { pte1_clear(pte1p); } else { pte2p = pt2map_entry(va); pte2_clear(pte2p); } } /* * Share new kernel PT2PG with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) { pmap_t pmap; pt2_entry_t *pte2p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, npte2); } mtx_unlock_spin(&allpmaps_lock); } /* * Share new kernel PTE1 with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } mtx_unlock_spin(&allpmaps_lock); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * NOTE: Read the comments above pmap_kenter_prot_attr() as * the function is used herein! */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t pte1_offset; pt1_entry_t npte1; uint32_t l1prot, l2prot; uint32_t l1attr, l2attr; PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," " prot = %d\n", __func__, *virt, start, end, end - start, prot)); l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l1prot = ATTR_TO_L1(l2prot); l2attr = PTE2_ATTR_DEFAULT; l1attr = ATTR_TO_L1(l2attr); va = *virt; /* * Does the physical address range's size and alignment permit at * least one section mapping to be created? */ pte1_offset = start & PTE1_OFFSET; if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= PTE1_SIZE) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of section mappings. */ if ((va & PTE1_OFFSET) < pte1_offset) va = pte1_trunc(va) + pte1_offset; else if ((va & PTE1_OFFSET) > pte1_offset) va = pte1_roundup(va) + pte1_offset; } sva = va; while (start < end) { if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { KASSERT((va & PTE1_OFFSET) == 0, ("%s: misaligned va %#x", __func__, va)); npte1 = PTE1_KERN(start, l1prot, l1attr); pmap_kenter_pte1(va, npte1); va += PTE1_SIZE; start += PTE1_SIZE; } else { pmap_kenter_prot_attr(va, start, l2prot, l2attr); va += PAGE_SIZE; start += PAGE_SIZE; } } tlb_flush_range(sva, va - sva); *virt = va; return (sva); } /* * Make a temporary mapping for a physical address. * This is only intended to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); tlb_flush_local(va); return ((void *)crashdumpmap); } /************************************* * * TLB & cache maintenance routines. * *************************************/ /* * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_tlb_flush(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush(va); } PMAP_INLINE void pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush_range(sva, size); } /* * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PTE2_* bits * are ever set, PTE2_V in particular. * - Assumes we can write to pte2s without pte2_store() atomic ops. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PTE2_V. * - Assumes a vm_offset_t will fit in a pte2 (true for arm). * Because PTE2_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_pte2list_alloc(vm_offset_t *head) { pt2_entry_t *pte2p; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte2p = pt2map_entry(va); *head = *pte2p; if (*head & PTE2_V) panic("%s: va with PTE2_V set!", __func__); *pte2p = 0; return (va); } static void pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) { pt2_entry_t *pte2p; if (va & PTE2_V) panic("%s: freeing va with PTE2_V set!", __func__); pte2p = pt2map_entry(va); *pte2p = *head; /* virtual! PTE2_V is 0 though */ *head = va; } static void pmap_pte2list_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_pte2list_free(head, va); } } /***************************************************************************** * * PMAP third and final stage initialization. * * After pmap_init() is called, PMAP subsystem is fully initialized. * *****************************************************************************/ SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static u_long nkpt2pg = NKPT2PG; SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); static int sp_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &sp_enabled, 0, "Are large page mappings enabled?"); bool pmap_ps_enabled(pmap_t pmap __unused) { return (sp_enabled != 0); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, "1MB page mapping counters"); static u_long pmap_pte1_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pte1_demotions, 0, "1MB page demotions"); static u_long pmap_pte1_mappings; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pte1_mappings, 0, "1MB page mappings"); static u_long pmap_pte1_p_failures; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pte1_p_failures, 0, "1MB page promotion failures"); static u_long pmap_pte1_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pte1_promotions, 0, "1MB page promotions"); static u_long pmap_pte1_kern_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); static u_long pmap_pte1_kern_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); static __inline ttb_entry_t pmap_ttb_get(pmap_t pmap) { return (vtophys(pmap->pm_pt1) | ttb_flags); } /* * Initialize a vm_page's machine-dependent fields. * * Variations: * 1. Pages for L2 page tables are always not managed. So, pv_list and * pt2_wirecount can share same physical space. However, proper * initialization on a page alloc for page tables and reinitialization * on the page free must be ensured. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); pt2_wirecount_init(m); m->md.pat_mode = VM_MEMATTR_DEFAULT; } /* * Virtualization for faster way how to zero whole page. */ static __inline void pagezero(void *page) { bzero(page, PAGE_SIZE); } /* * Zero L2 page table page. * Use same KVA as in pmap_zero_page(). */ static __inline vm_paddr_t pmap_pt2pg_zero(vm_page_t m) { pt2_entry_t *cmap2_pte2p; vm_paddr_t pa; struct pcpu *pc; pa = VM_PAGE_TO_PHYS(m); /* * XXX: For now, we map whole page even if it's already zero, * to sync it even if the sync is only DSB. */ sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); /* Even VM_ALLOC_ZERO request is only advisory. */ if ((m->flags & PG_ZERO) == 0) pagezero(pc->pc_cmap2_addr); pte2_sync_range((pt2_entry_t *)pc->pc_cmap2_addr, PAGE_SIZE); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); return (pa); } /* * Init just allocated page as L2 page table(s) holder * and return its physical address. */ static __inline vm_paddr_t pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_paddr_t pa; pt2_entry_t *pte2p; /* Check page attributes. */ if (m->md.pat_mode != pt_memattr) pmap_page_set_memattr(m, pt_memattr); /* Zero page and init wire counts. */ pa = pmap_pt2pg_zero(m); pt2_wirecount_init(m); /* * Map page to PT2MAP address space for given pmap. * Note that PT2MAP space is shared with all pmaps. */ if (pmap == kernel_pmap) pmap_kenter_pt2tab(va, PTE2_KPT(pa)); else { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, PTE2_KPT_NG(pa)); } return (pa); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; pt2_entry_t *pte2p, pte2; u_int i, pte1_idx, pv_npg; PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); /* * Initialize the vm page array entries for kernel pmap's * L2 page table pages allocated in advance. */ pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { vm_paddr_t pa; vm_page_t m; pte2 = pte2_load(pte2p); KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: L2 page table page is out of range", __func__)); m->pindex = pte1_idx; m->phys_addr = pa; pte1_idx += NPT2_IN_PG; } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); if (sp_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("%s: can't assign to pagesizes[1]", __func__)); pagesizes[1] = PTE1_SIZE; } /* * Calculate the size of the pv head table for sections. * Handle the possibility that "vm_phys_segs[...].end" is zero. * Note that the table is only for sections which could be promoted. */ first_managed_pa = pte1_trunc(vm_phys_segs[0].start); pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) - first_managed_pa) / PTE1_SIZE + 1; /* * Allocate memory for the pv head table for sections. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("%s: not enough kvm for pv chunks", __func__); pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { u_int anychanged; pt2_entry_t *epte2p, *pte2p, pte2; vm_page_t m; vm_paddr_t pa; anychanged = 0; pte2p = pt2map_entry(sva); epte2p = pte2p + count; while (pte2p < epte2p) { m = *ma++; pa = VM_PAGE_TO_PHYS(m); pte2 = pte2_load(pte2p); if ((pte2_pa(pte2) != pa) || (pte2_attr(pte2) != vm_page_pte2_attr(m))) { anychanged++; pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); } pte2p++; } if (__predict_false(anychanged)) tlb_flush_range(sva, count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } tlb_flush_range(sva, va - sva); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); } /* * If the given pmap is not the current or kernel pmap, the returned * pte2 must be released by passing it to pmap_pte2_release(). */ static pt2_entry_t * pmap_pte2(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Releases a pte2 that was obtained from pmap_pte2(). * Be prepared for the pte2p being NULL. */ static __inline void pmap_pte2_release(pt2_entry_t *pte2p) { if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { mtx_unlock(&PMAP2mutex); } } /* * Super fast pmap_pte2 routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire tlb flush for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt2_entry_t * pmap_pte2_quick(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t *pte2p; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) pa = pte1_pa(pte1) | (va & PTE1_OFFSET); else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); pmap_pte2_release(pte2p); } else pa = 0; PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2, *pte2p; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) { if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (pte2_is_valid(pte2) && (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /* * Grow the number of kernel L2 page table entries, if needed. */ void pmap_growkernel(vm_offset_t addr) { vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pt1_entry_t pte1; pt2_entry_t pte2; PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); /* * All the time kernel_vm_end is first KVA for which underlying * L2 page table is either not allocated or linked from L1 page table * (not considering sections). Except for two possible cases: * * (1) in the very beginning as long as pmap_growkernel() was * not called, it could be first unused KVA (which is not * rounded up to PTE1_SIZE), * * (2) when all KVA space is mapped and vm_map_max(kernel_map) * address is not rounded up to PTE1_SIZE. (For example, * it could be 0xFFFFFFFF.) */ kernel_vm_end = pte1_roundup(kernel_vm_end); mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, PTE1_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pte1 = pte1_load(kern_pte1(kernel_vm_end)); if (pte1_is_valid(pte1)) { kernel_vm_end += PTE1_SIZE; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } /* * kernel_vm_end_new is used in pmap_pinit() when kernel * mappings are entered to new pmap all at once to avoid race * between pmap_kenter_pte1() and kernel_vm_end increase. * The same aplies to pmap_kenter_pt2tab(). */ kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into kernel PT2TAB. */ m = vm_page_alloc(NULL, pte1_index(kernel_vm_end) & ~PT2PG_MASK, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) panic("%s: no memory to grow kernel", __func__); /* * QQQ: To link all new L2 page tables from L1 page * table now and so pmap_kenter_pte1() them * at once together with pmap_kenter_pt2tab() * could be nice speed up. However, * pmap_growkernel() does not happen so often... * QQQ: The other TTBR is another option. */ pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, m); } else pt2pg_pa = pte2_pa(pte2); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); kernel_vm_end = kernel_vm_end_new; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = vm_max_kernel_address - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = vm_max_kernel_address - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /*********************************************** * * Pmap allocation/deallocation routines. * ***********************************************/ /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); PMAP_LOCK_INIT(pmap); /* * Kernel page table directory and pmap stuff around is already * initialized, we are using it right now and here. So, finish * only PMAP structures initialization for process0 ... * * Since the L1 page table and PT2TAB is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pt1 = kern_pt1; pmap->pm_pt2tab = kern_pt2tab; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); CPU_SET(0, &pmap->pm_active); } static __inline void pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pte1_index(sva); count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); bcopy(spte1p + idx, dpte1p + idx, count); } static __inline void pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pt2tab_index(sva); count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); bcopy(spte2p + idx, dpte2p + idx, count); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; vm_paddr_t pa, pt2tab_pa; u_int i; PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, pmap->pm_pt1)); /* * No need to allocate L2 page table space yet but we do need * a valid L1 page table and PT2TAB table. * * Install shared kernel mappings to these tables. It's a little * tricky as some parts of KVA are reserved for vectors, devices, * and whatever else. These parts are supposed to be above * vm_max_kernel_address. Thus two regions should be installed: * * (1) . * * QQQ: The second region should be stable enough to be installed * only once in time when the tables are allocated. * QQQ: Maybe copy of both regions at once could be faster ... * QQQ: Maybe the other TTBR is an option. * * Finally, install own PT2TAB table to these tables. */ if (pmap->pm_pt1 == NULL) { pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); if (pmap->pm_pt1 == NULL) return (0); } if (pmap->pm_pt2tab == NULL) { /* * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page * only, what should be the only size for 32 bit systems, * then we could allocate it with vm_page_alloc() and all * the stuff needed as other L2 page table pages. * (2) Note that a process PT2TAB is special L2 page table * page. Its mapping in kernel_arena is permanent and can * be used no matter which process is current. Its mapping * in PT2MAP can be used only for current process. */ pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); if (pmap->pm_pt2tab == NULL) { /* * QQQ: As struct pmap is allocated from UMA with * UMA_ZONE_NOFREE flag, it's important to leave * no allocation in pmap if initialization failed. */ kmem_free((vm_offset_t)pmap->pm_pt1, NB_IN_PT1); pmap->pm_pt1 = NULL; return (0); } /* * QQQ: Each L2 page table page vm_page_t has pindex set to * pte1 index of virtual address mapped by this page. * It's not valid for non kernel PT2TABs themselves. * The pindex of these pages can not be altered because * of the way how they are allocated now. However, it * should not be a problem. */ } mtx_lock_spin(&allpmaps_lock); /* * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), * kernel_vm_end_new is used here instead of kernel_vm_end. */ pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, kernel_vm_end_new - 1); pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 0xFFFFFFFF); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, kernel_vm_end_new - 1); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 0xFFFFFFFF); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. * I.e. self reference mapping. The PT2TAB is private, however mapped * into shared PT2MAP space, so the mapping should be not global. */ pt2tab_pa = vtophys(pmap->pm_pt2tab); pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* Insert PT2MAP PT2s into pmap PT1. */ pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Now synchronize new mapping which was made above. */ pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } #ifdef INVARIANTS static boolean_t pt2tab_user_is_empty(pt2_entry_t *tab) { u_int i, end; end = pt2tab_index(VM_MAXUSER_ADDRESS); for (i = 0; i < end; i++) if (tab[i] != 0) return (FALSE); return (TRUE); } #endif /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { #ifdef INVARIANTS vm_offset_t start, end; #endif KASSERT(pmap->pm_stats.resident_count == 0, ("%s: pmap resident count %ld != 0", __func__, pmap->pm_stats.resident_count)); KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), ("%s: has allocated user PT2(s)", __func__)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); #ifdef INVARIANTS start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); bzero((char *)pmap->pm_pt1 + start, end - start); start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); bzero((char *)pmap->pm_pt2tab + start, end - start); #endif /* * We are leaving PT1 and PT2TAB allocated on released pmap, * so hopefully UMA vmspace_zone will always be inited with * UMA_ZONE_NOFREE flag. */ } /********************************************************* * * L2 table pages and their pages management routines. * *********************************************************/ /* * Virtual interface for L2 page table wire counting. * * Each L2 page table in a page has own counter which counts a number of * valid mappings in a table. Global page counter counts mappings in all * tables in a page plus a single itself mapping in PT2TAB. * * During a promotion we leave the associated L2 page table counter * untouched, so the table (strictly speaking a page which holds it) * is never freed if promoted. * * If a page m->wire_count == 1 then no valid mappings exist in any L2 page * table in the page and the page itself is only mapped in PT2TAB. */ static __inline void pt2_wirecount_init(vm_page_t m) { u_int i; /* * Note: A page m is allocated with VM_ALLOC_WIRED flag and * m->wire_count should be already set correctly. * So, there is no need to set it again herein. */ for (i = 0; i < NPT2_IN_PG; i++) m->md.pt2_wirecount[i] = 0; } static __inline void pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) { /* * Note: A just modificated pte2 (i.e. already allocated) * is acquiring one extra reference which must be * explicitly cleared. It influences the KASSERTs herein. * All L2 page tables in a page always belong to the same * pmap, so we allow only one extra reference for the page. */ KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), ("%s: PT2 is overflowing ...", __func__)); KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowing ...", __func__)); m->wire_count++; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; } static __inline void pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) { KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, ("%s: PT2 is underflowing ...", __func__)); KASSERT(m->wire_count > 1, ("%s: PT2PG is underflowing ...", __func__)); m->wire_count--; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; } static __inline void pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) { KASSERT(count <= NPTE2_IN_PT2, ("%s: invalid count %u", __func__, count)); KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; m->wire_count += count; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); } static __inline uint32_t pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) { return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); } static __inline boolean_t pt2_is_empty(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); } static __inline boolean_t pt2_is_full(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == NPTE2_IN_PT2); } static __inline boolean_t pt2pg_is_empty(vm_page_t m) { return (m->wire_count == 1); } /* * This routine is called if the L2 page table * is not mapped correctly. */ static vm_page_t _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { uint32_t pte1_idx; pt1_entry_t *pte1p; pt2_entry_t pte2; vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pte1_idx = pte1_index(va); pte1p = pmap->pm_pt1 + pte1_idx; KASSERT(pte1_load(pte1p) == 0, ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, pte1_load(pte1p))); pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into pmap PT2TAB. */ m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, * the L2 page table page may have been allocated. */ return (NULL); } pmap->pm_stats.resident_count++; pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { pt2pg_pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pt2pg_pa); } pt2_wirecount_inc(m, pte1_idx); pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); return (m); } static vm_page_t pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { u_int pte1_idx; pt1_entry_t *pte1p, pte1; vm_page_t m; pte1_idx = pte1_index(va); retry: pte1p = pmap->pm_pt1 + pte1_idx; pte1 = pte1_load(pte1p); /* * This supports switching from a 1MB page to a * normal 4K page. */ if (pte1_is_section(pte1)) { (void)pmap_demote_pte1(pmap, pte1p, va); /* * Reload pte1 after demotion. * * Note: Demotion can even fail as either PT2 is not find for * the virtual address or PT2PG can not be allocated. */ pte1 = pte1_load(pte1p); } /* * If the L2 page table page is mapped, we just increment the * hold count, and activate it. */ if (pte1_is_link(pte1)) { m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(m, pte1_idx); } else { /* * Here if the PT2 isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte2(pmap, va, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /* * Schedule the specified unused L2 page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) { /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ #ifdef PMAP_DEBUG pmap_zero_page_check(m); #endif m->flags |= PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Unwire L2 page tables page. */ static void pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt1_entry_t *pte1p, opte1 __unused; pt2_entry_t *pte2p; uint32_t i; KASSERT(pt2pg_is_empty(m), ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); /* * Unmap all L2 page tables in the page from L1 page table. * * QQQ: Individual L2 page tables (except the last one) can be unmapped * earlier. However, we are doing that this way. */ KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); pte1p = pmap->pm_pt1 + m->pindex; for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { KASSERT(m->md.pt2_wirecount[i] == 0, ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); opte1 = pte1_load(pte1p); if (pte1_is_link(opte1)) { pte1_clear(pte1p); /* * Flush intermediate TLB cache. */ pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); } #ifdef INVARIANTS else KASSERT((opte1 == 0) || pte1_is_section(opte1), ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, pmap, va, opte1, i)); #endif } /* * Unmap the page from PT2TAB. */ pte2p = pmap_pt2tab_entry(pmap, va); (void)pt2tab_load_clear(pte2p); pmap_tlb_flush(pmap, pt2map_pt2pg(va)); m->wire_count = 0; pmap->pm_stats.resident_count--; /* * This barrier is so that the ordinary store unmapping * the L2 page table page is globally performed before TLB shoot- * down is begun. */ wmb(); vm_wire_sub(1); } /* * Decrements a L2 page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static __inline boolean_t pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { pt2_wirecount_dec(m, pte1_index(va)); if (pt2pg_is_empty(m)) { /* * QQQ: Wire count is zero, so whole page should be zero and * we can set PG_ZERO flag to it. * Note that when promotion is enabled, it takes some * more efforts. See pmap_unwire_pt2_all() below. */ pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); return (TRUE); } else return (FALSE); } /* * Drop a L2 page table page's wire count at once, which is used to record * the number of valid L2 page table entries within the page. If the wire * count drops to zero, then the L2 page table page is unmapped. */ static __inline void pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { u_int pte1_idx = pte1_index(va); KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, pt2_wirecount_get(m, pte1_idx))); /* * It's possible that the L2 page table was never used. * It happened in case that a section was created without promotion. */ if (pt2_is_full(m, va)) { pt2_wirecount_set(m, pte1_idx, 0); /* * QQQ: We clear L2 page table now, so when L2 page table page * is going to be freed, we can set it PG_ZERO flag ... * This function is called only on section mappings, so * hopefully it's not to big overload. * * XXX: If pmap is current, existing PT2MAP mapping could be * used for zeroing. */ pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); } #ifdef INVARIANTS else KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", __func__, pt2_wirecount_get(m, pte1_idx))); #endif if (pt2pg_is_empty(m)) { pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); } } /* * After removing a L2 page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static boolean_t pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt1_entry_t pte1; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (FALSE); pte1 = pte1_load(pmap_pte1(pmap, va)); mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); return (pmap_unwire_pt2(pmap, va, mpte, free)); } /************************************* * * Page management routines. * *************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * Is given page managed? */ static __inline bool is_managed(vm_paddr_t pa) { vm_page_t m; m = PHYS_TO_VM_PAGE(pa); if (m == NULL) return (false); return ((m->oflags & VPO_UNMANAGED) == 0); } static __inline bool pte1_is_managed(pt1_entry_t pte1) { return (is_managed(pte1_pa(pte1))); } static __inline bool pte2_is_managed(pt2_entry_t pte2) { return (is_managed(pte2_pa(pte2))); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pt1_entry_t *pte1p; pmap_t pmap; pt2_entry_t *pte2p, tpte2; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffs(inuse) - 1; pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) continue; pte2p = pmap_pte2(pmap, va); tpte2 = pte2_load(pte2p); if ((tpte2 & PTE2_W) == 0) tpte2 = pte2_load_clear(pte2p); pmap_pte2_release(pte2p); if ((tpte2 & PTE2_W) != 0) continue; KASSERT(tpte2 != 0, ("pmap_pv_reclaim: pmap %p va %#x zero pte", pmap, va)); pmap_tlb_flush(pmap, va); m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); if (pte2_is_dirty(tpte2)) vm_page_dirty(m); if ((tpte2 & PTE2_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt2(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; vm_wire_add(1); } vm_page_free_pages_toq(&free, false); return (m_pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire_noq(m); vm_page_free(m); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); } /* * Free the pv_entry back to the free list. */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } /* * Get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entries tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffs(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the pte2list "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_pte2list_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } static void pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); /* * Transfer the 1mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pte1: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 1mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a section. */ static bool pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags) { struct md_page *pvh; pv_entry_t pv; bool noreclaim; rw_assert(&pvh_global_lock, RA_WLOCKED); noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; if ((noreclaim && pv_entry_count >= pv_entry_high_water) || (pv = get_pv_entry(pmap, noreclaim)) == NULL) return (false); pv->pv_va = va; pvh = pa_to_pvh(pte1_pa(pte1)); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (true); } static inline void pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) { /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); else pmap_tlb_flush(pmap, pte1_trunc(va)); } /* * Update kernel pte1 on all pmaps. * * The following function is called only on one cpu with disabled interrupts. * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way * nobody can invoke explicit hardware table walk during the update of pte1. * Unsolicited hardware table walk can still happen, invoked by speculative * data or instruction prefetch or even by speculative hardware table walk. * * The break-before-make approach should be implemented here. However, it's * not so easy to do that for kernel mappings as it would be unhappy to unmap * itself unexpectedly but voluntarily. */ static void pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; /* * Get current pmap. Interrupts should be disabled here * so PCPU_GET() is done atomically. */ pmap = PCPU_GET(curpmap); if (pmap == NULL) pmap = kernel_pmap; /* * (1) Change pte1 on current pmap. * (2) Flush all obsolete TLB entries on current CPU. * (3) Change pte1 on all pmaps. * (4) Flush all obsolete TLB entries on all CPUs in SMP case. */ pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) { pmap_pte1_kern_promotions++; tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); } else { pmap_pte1_kern_demotions++; tlb_flush_local(pte1_trunc(va)); } /* * In SMP case, this function is called when all cpus are at smp * rendezvous, so there is no need to use 'allpmaps_lock' lock here. * In UP case, the function is called with this lock locked. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } #ifdef SMP /* Kill all the small mappings or the big one only. */ if (pte1_is_section(npte1)) tlb_flush_range(pte1_trunc(va), PTE1_SIZE); else tlb_flush(pte1_trunc(va)); #endif } #ifdef SMP struct pte1_action { vm_offset_t va; pt1_entry_t npte1; u_int update; /* CPU that updates the PTE1 */ }; static void pmap_update_pte1_action(void *arg) { struct pte1_action *act = arg; if (act->update == PCPU_GET(cpuid)) pmap_update_pte1_kernel(act->va, act->npte1); } /* * Change pte1 on current pmap. * Note that kernel pte1 must be changed on all pmaps. * * According to the architecture reference manual published by ARM, * the behaviour is UNPREDICTABLE when two or more TLB entries map the same VA. * According to this manual, UNPREDICTABLE behaviours must never happen in * a viable system. In contrast, on x86 processors, it is not specified which * TLB entry mapping the virtual address will be used, but the MMU doesn't * generate a bogus translation the way it does on Cortex-A8 rev 2 (Beaglebone * Black). * * It's a problem when either promotion or demotion is being done. The pte1 * update and appropriate TLB flush must be done atomically in general. */ static void pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, pt1_entry_t npte1) { if (pmap == kernel_pmap) { struct pte1_action act; sched_pin(); act.va = va; act.npte1 = npte1; act.update = PCPU_GET(cpuid); smp_rendezvous_cpus(all_cpus, smp_no_rendezvous_barrier, pmap_update_pte1_action, NULL, &act); sched_unpin(); } else { register_t cspr; /* * Use break-before-make approach for changing userland * mappings. It can cause L1 translation aborts on other * cores in SMP case. So, special treatment is implemented * in pmap_fault(). To reduce the likelihood that another core * will be affected by the broken mapping, disable interrupts * until the mapping change is completed. */ cspr = disable_interrupts(PSR_I | PSR_F); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); restore_interrupts(cspr); } } #else static void pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, pt1_entry_t npte1) { if (pmap == kernel_pmap) { mtx_lock_spin(&allpmaps_lock); pmap_update_pte1_kernel(va, npte1); mtx_unlock_spin(&allpmaps_lock); } else { register_t cspr; /* * Use break-before-make approach for changing userland * mappings. It's absolutely safe in UP case when interrupts * are disabled. */ cspr = disable_interrupts(PSR_I | PSR_F); pte1_clear(pte1p); pmap_tlb_flush_pte1(pmap, va, npte1); pte1_store(pte1p, npte1); restore_interrupts(cspr); } } #endif #if VM_NRESERVLEVEL > 0 /* * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are * within a single page table page (PT2) to a single 1MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PTE1s are replicated in each pmap but * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only * read the PTE1 from the kernel pmap. */ static void pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t npte1; pt2_entry_t *fpte2p, fpte2, fpte2_fav; pt2_entry_t *pte2p, pte2; vm_offset_t pteva __unused; vm_page_t m __unused; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is * either invalid, unused, or does not map the first 4KB physical page * within a 1MB page. */ fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); fpte2 = pte2_load(fpte2p); if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != (PTE2_A | PTE2_V)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", __func__, va, pmap); return; } if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", __func__, va, pmap); return; } if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set without * a TLB invalidation. */ fpte2 |= PTE2_RO; pte2_store(fpte2p, fpte2); } /* * Examine each of the other PTE2s in the specified PT2. Abort if this * PTE2 maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE2. */ fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { pte2 = pte2_load(pte2p); if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", __func__, va, pmap); return; } if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set * without a TLB invalidation. See note above. */ pte2 |= PTE2_RO; pte2_store(pte2p, pte2); pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & PTE2_FRAME); CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", __func__, pteva, pmap); } if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", __func__, va, pmap); return; } fpte2_fav -= PTE2_SIZE; } /* * The page table page in its current state will stay in PT2TAB * until the PTE1 mapping the section is demoted by pmap_demote_pte1() * or destroyed by pmap_remove_pte1(). * * Note that L2 page table size is not equal to PAGE_SIZE. */ m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: PT2 page is out of range", __func__)); KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); /* * Get pte1 from pte2 format. */ npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; /* * Promote the pv entries. */ if (pte2_is_managed(fpte2)) pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); /* * Promote the mappings. */ pmap_change_pte1(pmap, pte1p, va, npte1); pmap_pte1_promotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Zero L2 page table page. */ static __inline void pmap_clear_pt2(pt2_entry_t *fpte2p) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) pte2_clear(pte2p); } /* * Removes a 1MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { vm_page_t m; uint32_t pte1_idx; pt2_entry_t *fpte2p; vm_paddr_t pt2_pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); m = pmap_pt2_page(pmap, va); if (m == NULL) /* * QQQ: Is this function called only on promoted pte1? * We certainly do section mappings directly * (without promotion) in kernel !!! */ panic("%s: missing pt2 page", __func__); pte1_idx = pte1_index(va); /* * Initialize the L2 page table. */ fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); pmap_clear_pt2(fpte2p); /* * Remove the mapping. */ pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); /* * QQQ: We do not need to invalidate PT2MAP mapping * as we did not change it. I.e. the L2 page table page * was and still is mapped the same way. */ } /* * Do the things to unmap a section in a process */ static void pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, struct spglist *free) { pt1_entry_t opte1; struct md_page *pvh; vm_offset_t eva, va; vm_page_t m; PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); /* * Clear and invalidate the mapping. It should occupy one and only TLB * entry. So, pmap_tlb_flush() called with aligned address should be * sufficient. */ opte1 = pte1_load_clear(pte1p); pmap_tlb_flush(pmap, sva); if (pte1_is_wired(opte1)) pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; if (pte1_is_managed(opte1)) { pvh = pa_to_pvh(pte1_pa(opte1)); pmap_pvh_free(pvh, pmap, sva); eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) { if (pte1_is_dirty(opte1)) vm_page_dirty(m); if (opte1 & PTE1_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { /* * L2 page table(s) can't be removed from kernel map as * kernel counts on it (stuff around pmap_growkernel()). */ pmap_remove_kernel_pte1(pmap, pte1p, sva); } else { /* * Get associated L2 page table page. * It's possible that the page was never allocated. */ m = pmap_pt2_page(pmap, sva); if (m != NULL) pmap_unwire_pt2_all(pmap, sva, m, free); } } /* * Fills L2 page table page with mappings to consecutive physical pages. */ static __inline void pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { pte2_store(pte2p, npte2); npte2 += PTE2_SIZE; } } /* * Tries to demote a 1MB page mapping. If demotion fails, the * 1MB page mapping is invalidated. */ static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t opte1, npte1; pt2_entry_t *fpte2p, npte2; vm_paddr_t pt2pg_pa, pt2_pa; vm_page_t m; struct spglist free; uint32_t pte1_idx, isnew = 0; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); opte1 = pte1_load(pte1p); KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { KASSERT(!pte1_is_wired(opte1), ("%s: PT2 page for a wired mapping is missing", __func__)); /* * Invalidate the 1MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); vm_page_free_pages_toq(&free, false); CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", __func__, va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; isnew = 1; /* * We init all L2 page tables in the page even if * we are going to change everything for one L2 page * table in a while. */ pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { if (va < VM_MAXUSER_ADDRESS) { if (pt2_is_empty(m, va)) isnew = 1; /* Demoting section w/o promotion. */ #ifdef INVARIANTS else KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" " count %u", __func__, pt2_wirecount_get(m, pte1_index(va)))); #endif } } pt2pg_pa = VM_PAGE_TO_PHYS(m); pte1_idx = pte1_index(va); /* * If the pmap is current, then the PT2MAP can provide access to * the page table page (promoted L2 page tables are not unmapped). * Otherwise, temporarily map the L2 page table page (m) into * the kernel's address space at either PADDR1 or PADDR2. * * Note that L2 page table size is not equal to PAGE_SIZE. */ if (pmap_is_current(pmap)) fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); } else { mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); } pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); npte1 = PTE1_LINK(pt2_pa); KASSERT((opte1 & PTE1_A) != 0, ("%s: opte1 is missing PTE1_A", __func__)); KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, ("%s: opte1 has PTE1_NM", __func__)); /* * Get pte2 from pte1 format. */ npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; /* * If the L2 page table page is new, initialize it. If the mapping * has changed attributes, update the page table entries. */ if (isnew != 0) { pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); pmap_fill_pt2(fpte2p, npte2); } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != (npte2 & PTE2_PROMOTE)) pmap_fill_pt2(fpte2p, npte2); KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), ("%s: fpte2p and npte2 map different physical addresses", __func__)); if (fpte2p == PADDR2) mtx_unlock(&PMAP2mutex); /* * Demote the mapping. This pmap is locked. The old PTE1 has * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also * has not PTE1_NM set. Thus, there is no danger of a race with * another processor changing the setting of PTE1_A and/or PTE1_NM * between the read above and the store below. */ pmap_change_pte1(pmap, pte1p, va, npte1); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_pv_reclaim(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PTE1 must have already changed from mapping * the 1mpage to referencing the page table page. */ if (pte1_is_managed(opte1)) pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); pmap_pte1_demotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); return (TRUE); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pt2_entry_t npte2, opte2; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte2, om; int rv; va = trunc_page(va); KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("%s: managed mapping within the clean submap", __func__)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("%s: flags %u has reserved bits set", __func__, flags)); pa = VM_PAGE_TO_PHYS(m); npte2 = PTE2(pa, PTE2_A, vm_page_pte2_attr(m)); if ((flags & VM_PROT_WRITE) == 0) npte2 |= PTE2_NM; if ((prot & VM_PROT_WRITE) == 0) npte2 |= PTE2_RO; KASSERT((npte2 & (PTE2_NM | PTE2_RO)) != PTE2_RO, ("%s: flags includes VM_PROT_WRITE but prot doesn't", __func__)); if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; if ((flags & PMAP_ENTER_WIRED) != 0) npte2 |= PTE2_W; if (va < VM_MAXUSER_ADDRESS) npte2 |= PTE2_U; if (pmap != kernel_pmap) npte2 |= PTE2_NG; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PTE1_OFFSET) == 0, ("%s: va unaligned", __func__)); KASSERT(m->psind > 0, ("%s: m->psind < psind", __func__)); rv = pmap_enter_pte1(pmap, va, PTE1_PA(pa) | ATTR_TO_L1(npte2) | PTE1_V, flags, m); goto out; } /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte2 = pmap_allocpte2(pmap, va, flags); if (mpte2 == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte2 failed with sleep allowed")); rv = KERN_RESOURCE_SHORTAGE; goto out; } } else mpte2 = NULL; pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) panic("%s: attempted on 1MB page", __func__); pte2p = pmap_pte2_quick(pmap, va); if (pte2p == NULL) panic("%s: invalid L1 page table entry va=%#x", __func__, va); om = NULL; opte2 = pte2_load(pte2p); opa = pte2_pa(opte2); /* * Mapping has not changed, must be protection or wiring change. */ if (pte2_is_valid(opte2) && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT2 pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT2 page will be also. */ if (pte2_is_wired(npte2) && !pte2_is_wired(opte2)) pmap->pm_stats.wired_count++; else if (!pte2_is_wired(npte2) && pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; /* * Remove extra pte2 reference */ if (mpte2) pt2_wirecount_dec(mpte2, pte1_index(va)); if ((m->oflags & VPO_UNMANAGED) == 0) om = m; goto validate; } /* * QQQ: We think that changing physical address on writeable mapping * is not safe. Well, maybe on kernel address space with correct * locking, it can make a sense. However, we have no idea why * anyone should do that on user address space. Are we wrong? */ KASSERT((opa == 0) || (opa == pa) || !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", __func__, pmap, va, opte2, opa, pa, flags, prot)); pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; om = PHYS_TO_VM_PAGE(opa); if (om != NULL && (om->oflags & VPO_UNMANAGED) != 0) om = NULL; if (om != NULL) pv = pmap_pvh_remove(&om->md, pmap, va); /* * Remove extra pte2 reference */ if (mpte2 != NULL) pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (pte2_is_wired(npte2)) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ if (prot & VM_PROT_WRITE) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * If the mapping or permission bits are different, we need * to update the pte2. * * QQQ: Think again and again what to do * if the mapping is going to be changed! */ if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, there is a race * for other threads of current process in lazy loading case. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). * * QQQ: (1) Does it exist any better way where * or how to sync icache? * (2) Now, we do it on a page basis. */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pat_mode == VM_MEMATTR_WB_WA && (opa != pa || (opte2 & PTE2_NX))) cache_icache_sync_fresh(va, pa, PAGE_SIZE); if (opte2 & PTE2_V) { /* Change mapping with break-before-make approach. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); pte2_store(pte2p, npte2); if (om != NULL) { KASSERT((om->oflags & VPO_UNMANAGED) == 0, ("%s: om %p unmanaged", __func__, om)); if ((opte2 & PTE2_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); if (pte2_is_dirty(opte2)) vm_page_dirty(om); if (TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } } else pte2_store(pte2p, npte2); } #if 0 else { /* * QQQ: In time when both access and not mofified bits are * emulated by software, this should not happen. Some * analysis is need, if this really happen. Missing * tlb flush somewhere could be the reason. */ panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, va, opte2, npte2); } #endif #if VM_NRESERVLEVEL > 0 /* * If both the L2 page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pte1(pmap, pte1p, va); #endif rv = KERN_SUCCESS; out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Do the things to unmap a page in a process. */ static int pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, struct spglist *free) { pt2_entry_t opte2; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Clear and invalidate the mapping. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", __func__, pmap, va, opte2)); if (opte2 & PTE2_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte2_is_managed(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt2(pmap, va, free)); } /* * Remove a single page from a process address space. */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt2_entry_t *pte2p; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || !pte2_is_valid(pte2_load(pte2p))) return; pmap_remove_pte2(pmap, pte2p, va, free); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * Special handling of removing one page. A very common * operation and easy to short circuit some code. */ if (sva + PAGE_SIZE == eva) { pte1 = pte1_load(pmap_pte1(pmap, sva)); if (pte1_is_link(pte1)) { pmap_remove_page(pmap, sva, &free); goto out; } } for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; if (pmap->pm_stats.resident_count == 0) break; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that the L1 page * table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_remove_pte1(pmap, pte1p, sva, &free); continue; } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* The large page mapping was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion. */ pte1 = pte1_load(pte1p); } #endif } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being removed. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (pmap_remove_pte2(pmap, pte2p, sva, &free)) break; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt2_entry_t *pte2p, opte2; pt1_entry_t *pte1p; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " "a 1mpage in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, pv->pv_va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", __func__, pmap, pv->pv_va)); if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_unuse_pt2(pmap, pv->pv_va, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); } /* * Just subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m, mt, mpt2pg; struct md_page *pvh; pa = pte1_pa(pte1); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte1 %#x", __func__, pte1)); if (pte1_is_dirty(pte1)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; pvh = pa_to_pvh(pa); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpt2pg = pmap_pt2_page(pmap, pv->pv_va); if (mpt2pg != NULL) pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); } /* * Just subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m; struct md_page *pvh; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte2 %#x", __func__, pte2)); if (pte2_is_dirty(pte2)) vm_page_dirty(m); pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(pa); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt2(pmap, pv->pv_va, free); } /* * Remove all pages from specified address space this aids process * exit speeds. Also, this code is special cased for current process * only, but can have the more generic (and slightly slower) mode enabled. * This is much faster than pmap_remove in the case of running down * an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; pv_entry_t pv; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; boolean_t allfree; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), ("%s: non-current pmap %p", __func__, pmap)); #if defined(SMP) && defined(INVARIANTS) { cpuset_t other_cpus; sched_pin(); other_cpus = pmap->pm_active; CPU_CLR(PCPU_GET(cpuid), &other_cpus); sched_unpin(); KASSERT(CPU_EMPTY(&other_cpus), ("%s: pmap %p active on other cpus", __func__, pmap)); } #endif SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", __func__, pmap, pc->pc_pmap)); allfree = TRUE; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = ffs(inuse) - 1; bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; /* * Note that we cannot remove wired pages * from a process' mapping at this time */ pte1p = pmap_pte1(pmap, pv->pv_va); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) { allfree = FALSE; continue; } pte1_clear(pte1p); pmap_remove_pte1_quick(pmap, pte1, pv, &free); } else if (pte1_is_link(pte1)) { pte2p = pt2map_entry(pv->pv_va); pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) { printf("%s: pmap %p va %#x " "pte2 %#x\n", __func__, pmap, pv->pv_va, pte2); panic("bad pte2"); } if (pte2_is_wired(pte2)) { allfree = FALSE; continue; } pte2_clear(pte2p); pmap_remove_pte2_quick(pmap, pte2, pv, &free); } else { printf("%s: pmap %p va %#x pte1 %#x\n", __func__, pmap, pv->pv_va, pte1); panic("bad pte1"); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } tlb_flush_all_ng_local(); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * This code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No L2 page table pages. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpt2pg) { pt2_entry_t *pte2p, pte2; vm_paddr_t pa; struct spglist free; uint32_t l2prot; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("%s: managed mapping within the clean submap", __func__)); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a L2 page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int pte1_idx; pt1_entry_t pte1, *pte1p; vm_paddr_t pt2_pa; /* * Get L1 page table things. */ pte1_idx = pte1_index(va); pte1p = pmap_pte1(pmap, va); pte1 = pte1_load(pte1p); if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { /* * Each of NPT2_IN_PG L2 page tables on the page can * come here. Make sure that associated L1 page table * link is established. * * QQQ: It comes that we don't establish all links to * L2 page tables for newly allocated L2 page * tables page. */ KASSERT(!pte1_is_section(pte1), ("%s: pte1 %#x is section", __func__, pte1)); if (!pte1_is_link(pte1)) { pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } pt2_wirecount_inc(mpt2pg, pte1_idx); } else { /* * If the L2 page table page is mapped, we just * increment the hold count, and activate it. */ if (pte1_is_section(pte1)) { return (NULL); } else if (pte1_is_link(pte1)) { mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(mpt2pg, pte1_idx); } else { mpt2pg = _pmap_allocpte2(pmap, va, PMAP_ENTER_NOSLEEP); if (mpt2pg == NULL) return (NULL); } } } else { mpt2pg = NULL; } /* * This call to pt2map_entry() makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte2_quick(). * But that isn't as quick as pt2map_entry(). */ pte2p = pt2map_entry(va); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { if (mpt2pg != NULL) { /* * Remove extra pte2 reference */ pt2_wirecount_dec(mpt2pg, pte1_index(va)); mpt2pg = NULL; } return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpt2pg != NULL) { SLIST_INIT(&free); if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { pmap_tlb_flush(pmap, va); vm_page_free_pages_toq(&free, false); } mpt2pg = NULL; } return (NULL); } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ pa = VM_PAGE_TO_PHYS(m); l2prot = PTE2_RO | PTE2_NM; if (va < VM_MAXUSER_ADDRESS) l2prot |= PTE2_U | PTE2_NG; if ((prot & VM_PROT_EXECUTE) == 0) l2prot |= PTE2_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ cache_icache_sync_fresh(va, pa, PAGE_SIZE); } pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); return (mpt2pg); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Tries to create a read- and/or execute-only 1 MB page mapping. Returns * true if successful. Returns false if (1) a mapping already exists at the * specified virtual address or (2) a PV entry cannot be allocated without * reclaiming another PV entry. */ static bool pmap_enter_1mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pt1_entry_t pte1; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pa = VM_PAGE_TO_PHYS(m); pte1 = PTE1(pa, PTE1_NM | PTE1_RO, ATTR_TO_L1(vm_page_pte2_attr(m))); if ((prot & VM_PROT_EXECUTE) == 0) pte1 |= PTE1_NX; if (va < VM_MAXUSER_ADDRESS) pte1 |= PTE1_U; if (pmap != kernel_pmap) pte1 |= PTE1_NG; return (pmap_enter_pte1(pmap, va, pte1, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m) == KERN_SUCCESS); } /* * Tries to create the specified 1 MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NORECLAIM was specified and PV entry * allocation failed. */ static int pmap_enter_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t pte1, u_int flags, vm_page_t m) { struct spglist free; pt1_entry_t opte1, *pte1p; pt2_entry_t pte2, *pte2p; vm_offset_t cur, end; vm_page_t mt; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pte1 & (PTE1_NM | PTE1_RO)) == 0 || (pte1 & (PTE1_NM | PTE1_RO)) == (PTE1_NM | PTE1_RO), ("%s: pte1 has inconsistent NM and RO attributes", __func__)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pte1p = pmap_pte1(pmap, va); opte1 = pte1_load(pte1p); if (pte1_is_valid(opte1)) { if ((flags & PMAP_ENTER_NOREPLACE) != 0) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if (pte1_is_section(opte1)) { /* * If the section resulted from a promotion, then a * reserved PT page could be freed. */ pmap_remove_pte1(pmap, pte1p, va, &free); } else { sched_pin(); end = va + PTE1_SIZE; for (cur = va, pte2p = pmap_pte2_quick(pmap, va); cur != end; cur += PAGE_SIZE, pte2p++) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (pmap_remove_pte2(pmap, pte2p, cur, &free)) break; } sched_unpin(); } vm_page_free_pages_toq(&free, false); } if ((m->oflags & VPO_UNMANAGED) == 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pte1(pmap, va, pte1, flags)) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((pte1 & PTE1_RO) == 0) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if (pte1_is_wired(pte1)) pmap->pm_stats.wired_count += PTE1_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ if ((pte1 & PTE1_NX) == 0 && m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap && (!pte1_is_section(opte1) || pte1_pa(opte1) != VM_PAGE_TO_PHYS(m) || (opte1 & PTE2_NX) != 0)) cache_icache_sync_fresh(va, VM_PAGE_TO_PHYS(m), PTE1_SIZE); /* * Map the section. */ pte1_store(pte1p, pte1); pmap_pte1_mappings++; CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpt2pg; vm_pindex_t diff, psize; PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", __func__, pmap, start, end, m_start, prot)); VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpt2pg = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && m->psind == 1 && sp_enabled && pmap_enter_1mpage(pmap, va, m, prot)) m = &m[PTE1_SIZE / PAGE_SIZE - 1]; else mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, mpt2pg); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pt1_entry_t *pte1p; vm_paddr_t pa, pte2_pa; vm_page_t p; vm_memattr_t pat_mode; u_int l1attr, l1prot; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("%s: non-device object", __func__)); if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 1MB page boundary. */ pte2_pa = VM_PAGE_TO_PHYS(p); if (pte2_pa & PTE1_OFFSET) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 1MB pages. * * QQQ: Well, we are mapping a section, so same condition must * be hold like during promotion. It looks that only RW mapping * is done here, so readonly mapping must be done elsewhere. */ l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); PMAP_LOCK(pmap); for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { pte1p = pmap_pte1(pmap, addr); if (!pte1_is_valid(pte1_load(pte1p))) { pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } /* Else continue on if the PTE1 is already valid. */ addr += PTE1_SIZE; } PMAP_UNLOCK(pmap); } } /* * Do the things to protect a 1mpage in a process. */ static void pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, vm_prot_t prot) { pt1_entry_t npte1, opte1; vm_offset_t eva, va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); opte1 = npte1 = pte1_load(pte1p); if (pte1_is_managed(opte1) && pte1_is_dirty(opte1)) { eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) npte1 |= PTE1_RO | PTE1_NM; if ((prot & VM_PROT_EXECUTE) == 0) npte1 |= PTE1_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte1 != opte1) { pte1_store(pte1p, npte1); pmap_tlb_flush(pmap, sva); } } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { boolean_t pv_lists_locked; vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, opte2, npte2; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_protect_pte1(pmap, pte1p, sva, prot); continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping * was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { vm_page_t m; opte2 = npte2 = pte2_load(pte2p); if (!pte2_is_valid(opte2)) continue; if ((prot & VM_PROT_WRITE) == 0) { if (pte2_is_managed(opte2) && pte2_is_dirty(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); vm_page_dirty(m); } npte2 |= PTE2_RO | PTE2_NM; } if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte2 != opte2) { pte2_store(pte2p, npte2); pmap_tlb_flush(pmap, sva); } } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt1_entry_t pte1; pt2_entry_t pte2; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) count++; } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); if (pte2_is_wired(pte2)) count++; } PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 1mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = pte1_is_dirty(pte1); } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = pte2_is_dirty(pte2); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTE2s can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt1_entry_t pte1; pt2_entry_t pte2; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, addr)); if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(addr)); rv = !pte2_is_valid(pte2) ; } PMAP_UNLOCK(pmap); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 1mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); } else { pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); opte1 = pte1_load(pte1p); if (pte1_is_dirty(opte1)) { /* * Although "opte1" is mapping a 1MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((opte1 & PTE1_A) != 0) { /* * Since this reference bit is shared by 256 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual section number, and the pmap * address to select one 4KB page out of the 256 * on which testing the reference bit will result * in clearing that bit. This function is designed * to avoid the selection of the same 4KB page * for every 1MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the section is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && !pte1_is_wired(opte1)) { pte1_clear_bit(pte1p, PTE1_A); pmap_tlb_flush(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(pte1_is_link(pte1_load(pte1p)), ("%s: not found a link in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load(pte2p); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if ((opte2 & PTE2_A) != 0) { pte2_clear_bit(pte2p, PTE2_A); pmap_tlb_flush(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { if (!pte1_is_wired(pte1)) panic("%s: pte1 %#x not wired", __func__, pte1); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pte1_clear_bit(pte1p, PTE1_W); pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) panic("%s: demotion failed", __func__); #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (!pte2_is_wired(pte2)) panic("%s: pte2 %#x is missing PTE2_W", __func__, pte2); /* * PTE2_W must be cleared atomically. Although the pmap * lock synchronizes access to PTE2_W, another processor * could be changing PTE2_NM and/or PTE2_A concurrently. */ pte2_clear_bit(pte2p, PTE2_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); if (!(pte1_load(pte1p) & PTE1_RO)) (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load(pte2p); if (!(opte2 & PTE2_RO)) { pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, pte2; vm_offset_t pdnxt; vm_page_t m; boolean_t pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = pte1_trunc(sva + PTE1_SIZE); if (pdnxt < sva) pdnxt = eva; pte1p = pmap_pte1(pmap, sva); opte1 = pte1_load(pte1p); if (!pte1_is_valid(opte1)) /* XXX */ continue; else if (pte1_is_section(opte1)) { if (!pte1_is_managed(opte1)) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying L2 page * table is fully populated, this removal never * frees a L2 page table page. */ if (!pte1_is_wired(opte1)) { pte2p = pmap_pte2_quick(pmap, sva); KASSERT(pte2_is_valid(pte2_load(pte2p)), ("%s: invalid PTE2", __func__)); pmap_remove_pte2(pmap, pte2p, sva, NULL); } } if (pdnxt > eva) pdnxt = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) continue; else if (pte2_is_dirty(pte2)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); vm_page_dirty(m); } pte2_set_bit(pte2p, PTE2_NM); pte2_clear_bit(pte2p, PTE2_A); } else if ((pte2 & PTE2_A) != 0) pte2_clear_bit(pte2p, PTE2_A); else continue; pmap_tlb_flush(pmap, sva); } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("%s: page %p is exclusive busy", __func__, m)); /* * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM * cleared. If the object containing the page is locked and the page * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently * set. */ if ((m->flags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); opte1 = pte1_load(pte1p); if (!(opte1 & PTE1_RO)) { if (pmap_demote_pte1(pmap, pte1p, va) && !pte1_is_wired(opte1)) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); pte2p = pmap_pte2_quick(pmap, va); opte2 = pte2_load(pte2p); if ((opte2 & PTE2_V)) { pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); vm_page_dirty(m); pmap_tlb_flush(pmap, va); } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); if (pte2_is_dirty(pte2_load(pte2p))) { pte2_set_bit(pte2p, PTE2_NM); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { pt2_entry_t *cmap2_pte2p; vm_memattr_t oma; vm_paddr_t pa; struct pcpu *pc; oma = m->md.pat_mode; m->md.pat_mode = ma; CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, VM_PAGE_TO_PHYS(m), oma, ma); if ((m->flags & PG_FICTITIOUS) != 0) return; #if 0 /* * If "m" is a normal page, flush it from the cache. * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m, oma)) return; #endif /* * If page is not mapped by sf buffer, map the page * transient and do invalidation. */ if (ma != oma) { pa = VM_PAGE_TO_PHYS(m); sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_memattr_to_pte2(ma))); dcache_wbinv_poc((vm_offset_t)pc->pc_cmap2_addr, pa, PAGE_SIZE); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } } /* * Miscellaneous support routines follow */ /* * Returns TRUE if the given page is mapped individually or as part of * a 1mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(pc->pc_cmap2_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap2_addr); else bzero(pc->pc_cmap2_addr + off, size); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap1_pte2p = pc->pc_cmap1_pte2p; cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap1_pte2p) != 0) panic("%s: CMAP1 busy", __func__); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), PTE2_AP_KRW, vm_page_pte2_attr(dst))); bcopy(pc->pc_cmap1_addr, pc->pc_cmap2_addr, PAGE_SIZE); pte2_clear(cmap1_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap1_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { pt2_entry_t *cmap1_pte2p, *cmap2_pte2p; vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap1_pte2p = pc->pc_cmap1_pte2p; cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap1_pte2p) != 0) panic("pmap_copy_pages: CMAP1 busy"); if (pte2_load(cmap2_pte2p) != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); pte2_store(cmap1_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); tlb_flush_local((vm_offset_t)pc->pc_cmap1_addr); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); tlb_flush_local((vm_offset_t)pc->pc_cmap2_addr); a_cp = pc->pc_cmap1_addr + a_pg_offset; b_cp = pc->pc_cmap2_addr + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } pte2_clear(cmap1_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap1_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } vm_offset_t pmap_quick_enter_page(vm_page_t m) { struct pcpu *pc; pt2_entry_t *pte2p; critical_enter(); pc = get_pcpu(); pte2p = pc->pc_qmap_pte2p; KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); return (pc->pc_qmap_addr); } void pmap_quick_remove_page(vm_offset_t addr) { struct pcpu *pc; pt2_entry_t *pte2p; pc = get_pcpu(); pte2p = pc->pc_qmap_pte2p; KASSERT(addr == pc->pc_qmap_addr, ("%s: invalid address", __func__)); KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); pte2_clear(pte2p); tlb_flush(pc->pc_qmap_addr); critical_exit(); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t nextva; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = nextva) { pt2_entry_t *src_pte2p, *dst_pte2p; vm_page_t dst_mpt2pg, src_mpt2pg; pt1_entry_t src_pte1; u_int pte1_idx; KASSERT(addr < VM_MAXUSER_ADDRESS, ("%s: invalid to pmap_copy page tables", __func__)); nextva = pte1_trunc(addr + PTE1_SIZE); if (nextva < addr) nextva = end_addr; pte1_idx = pte1_index(addr); src_pte1 = src_pmap->pm_pt1[pte1_idx]; if (pte1_is_section(src_pte1)) { if ((addr & PTE1_OFFSET) != 0 || (addr + PTE1_SIZE) > end_addr) continue; if (dst_pmap->pm_pt1[pte1_idx] == 0 && (!pte1_is_managed(src_pte1) || pmap_pv_insert_pte1(dst_pmap, addr, src_pte1, PMAP_ENTER_NORECLAIM))) { dst_pmap->pm_pt1[pte1_idx] = src_pte1 & ~PTE1_W; dst_pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } continue; } else if (!pte1_is_link(src_pte1)) continue; src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); /* * We leave PT2s to be linked from PT1 even if they are not * referenced until all PT2s in a page are without reference. * * QQQ: It could be changed ... */ #if 0 /* single_pt2_link_is_cleared */ KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, ("%s: source page table page is unused", __func__)); #else if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) continue; #endif if (nextva > end_addr) nextva = end_addr; src_pte2p = pt2map_entry(addr); while (addr < nextva) { pt2_entry_t temp_pte2; temp_pte2 = pte2_load(src_pte2p); /* * we only virtual copy managed pages */ if (pte2_is_managed(temp_pte2)) { dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dst_mpt2pg == NULL) goto out; dst_pte2p = pmap_pte2_quick(dst_pmap, addr); if (!pte2_is_valid(pte2_load(dst_pte2p)) && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ temp_pte2 &= ~(PTE2_W | PTE2_A); temp_pte2 |= PTE2_NM; pte2_store(dst_pte2p, temp_pte2); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_pt2(dst_pmap, addr, dst_mpt2pg, &free)) { pmap_tlb_flush(dst_pmap, addr); vm_page_free_pages_toq(&free, false); } goto out; } if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= pt2_wirecount_get(src_mpt2pg, pte1_idx)) break; } addr += PAGE_SIZE; src_pte2p++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more section mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t pte1_offset; if (size < PTE1_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); pte1_offset = offset & PTE1_OFFSET; if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || (*addr & PTE1_OFFSET) == pte1_offset) return; if ((*addr & PTE1_OFFSET) < pte1_offset) *addr = pte1_trunc(*addr) + pte1_offset; else *addr = pte1_roundup(*addr) + pte1_offset; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid, ttb; PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif ttb = pmap_ttb_get(pmap); /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_pagedir = ttb; cp15_ttbr_set(ttb); PCPU_SET(curpmap, pmap); critical_exit(); } /* * Perform the pmap work for mincore. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; vm_paddr_t pa; bool managed; int val; PMAP_LOCK(pmap); retry: pte1p = pmap_pte1(pmap, addr); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); managed = pte1_is_managed(pte1); val = MINCORE_SUPER | MINCORE_INCORE; if (pte1_is_dirty(pte1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte1 & PTE1_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, addr); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); pa = pte2_pa(pte2); managed = pte2_is_managed(pte2); val = MINCORE_INCORE; if (pte2_is_dirty(pte2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte2 & PTE2_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else { managed = false; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { vm_offset_t sva; uint32_t l2attr; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); while (size != 0) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { vm_offset_t sva; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; while (size != 0) { pmap_kremove(va); va += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) { pcb->pcb_pagedir = pmap_ttb_get(pmap); } /* * Clean L1 data cache range by physical address. * The range must be within a single page. */ static void pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) { pt2_entry_t *cmap2_pte2p; struct pcpu *pc; KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, ("%s: not on single page", __func__)); sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); dcache_wb_pou((vm_offset_t)pc->pc_cmap2_addr + (pa & PAGE_MASK), size); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Sync instruction cache range which is not mapped yet. */ void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { uint32_t len, offset; vm_page_t m; /* Write back d-cache on given address range. */ offset = pa & PAGE_MASK; for ( ; size != 0; size -= len, pa += len, offset = 0) { len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } /* * I-cache is VIPT. Only way how to flush all virtual mappings * on given physical address is to invalidate all i-cache. */ icache_inv_all(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) { /* Write back d-cache on given address range. */ if (va >= VM_MIN_KERNEL_ADDRESS) { dcache_wb_pou(va, size); } else { uint32_t len, offset; vm_paddr_t pa; vm_page_t m; offset = va & PAGE_MASK; for ( ; size != 0; size -= len, va += len, offset = 0) { pa = pmap_extract(pmap, va); /* offset is preserved */ len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } } /* * I-cache is VIPT. Only way how to flush all virtual mappings * on given physical address is to invalidate all i-cache. */ icache_inv_all(); } /* * The implementation of pmap_fault() uses IN_RANGE2() macro which * depends on the fact that given range size is a power of 2. */ CTASSERT(powerof2(NB_IN_PT1)); CTASSERT(powerof2(PT2MAP_SIZE)); #define IN_RANGE2(addr, start, size) \ ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) /* * Handle access and R/W emulation faults. */ int pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; if (pmap == NULL) pmap = kernel_pmap; /* * In kernel, we should never get abort with FAR which is in range of * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here * and print out a useful abort message and even get to the debugger * otherwise it likely ends with never ending loop of aborts. */ if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { /* * All L1 tables should always be mapped and present. * However, we check only current one herein. For user mode, * only permission abort from malicious user is not fatal. * And alignment abort as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", __func__, pmap, pmap->pm_pt1, far); panic("%s: pm_pt1 abort", __func__); } return (KERN_INVALID_ADDRESS); } if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { /* * PT2MAP should be always mapped and present in current * L1 table. However, only existing L2 tables are mapped * in PT2MAP. For user mode, only L2 translation abort and * permission abort from malicious user is not fatal. * And alignment abort as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", __func__, pmap, PT2MAP, far); panic("%s: PT2MAP abort", __func__); } return (KERN_INVALID_ADDRESS); } /* * A pmap lock is used below for handling of access and R/W emulation * aborts. They were handled by atomic operations before so some * analysis of new situation is needed to answer the following question: * Is it safe to use the lock even for these aborts? * * There may happen two cases in general: * * (1) Aborts while the pmap lock is locked already - this should not * happen as pmap lock is not recursive. However, under pmap lock only * internal kernel data should be accessed and such data should be * mapped with A bit set and NM bit cleared. If double abort happens, * then a mapping of data which has caused it must be fixed. Further, * all new mappings are always made with A bit set and the bit can be * cleared only on managed mappings. * * (2) Aborts while another lock(s) is/are locked - this already can * happen. However, there is no difference here if it's either access or * R/W emulation abort, or if it's some other abort. */ PMAP_LOCK(pmap); #ifdef INVARIANTS pte1 = pte1_load(pmap_pte1(pmap, far)); if (pte1_is_link(pte1)) { /* * Check in advance that associated L2 page table is mapped into * PT2MAP space. Note that faulty access to not mapped L2 page * table is caught in more general check above where "far" is * checked that it does not lay in PT2MAP space. Note also that * L1 page table and PT2TAB always exist and are mapped. */ pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); if (!pte2_is_valid(pte2)) panic("%s: missing L2 page table (%p, %#x)", __func__, pmap, far); } #endif #ifdef SMP /* * Special treatment is due to break-before-make approach done when * pte1 is updated for userland mapping during section promotion or * demotion. If not caught here, pmap_enter() can find a section * mapping on faulting address. That is not allowed. */ if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } #endif /* * Accesss bits for page and section. Note that the entry * is not in TLB yet, so TLB flush is not necessary. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if (idx == FAULT_ACCESS_L2) { pte1 = pte1_load(pmap_pte1(pmap, far)); if (pte1_is_link(pte1)) { /* L2 page table should exist and be mapped. */ pte2p = pt2map_entry(far); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { pte2_store(pte2p, pte2 | PTE2_A); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } else { /* * We got L2 access fault but PTE1 is not a link. * Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L2 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if (idx == FAULT_ACCESS_L1) { pte1p = pmap_pte1(pmap, far); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pte1_store(pte1p, pte1 | PTE1_A); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } else { /* * We got L1 access fault but PTE1 is not section * mapping. Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_ACCESS_L1 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * Handle modify bits for page and section. Note that the modify * bit is emulated by software. So PTEx_RO is software read only * bit and PTEx_NM flag is real hardware read only bit. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { pte1 = pte1_load(pmap_pte1(pmap, far)); if (pte1_is_link(pte1)) { /* L2 page table should exist and be mapped. */ pte2p = pt2map_entry(far); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && (pte2 & PTE2_NM)) { pte2_store(pte2p, pte2 & ~PTE2_NM); tlb_flush(trunc_page(far)); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } else { /* * We got L2 permission fault but PTE1 is not a link. * Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_PERM_L2 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { pte1p = pmap_pte1(pmap, far); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { if (!(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { pte1_store(pte1p, pte1 & ~PTE1_NM); tlb_flush(pte1_trunc(far)); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } else { /* * We got L1 permission fault but PTE1 is not section * mapping. Probably some race happened, do nothing. */ CTR3(KTR_PMAP, "%s: FAULT_PERM_L1 - pmap %#x far %#x", __func__, pmap, far); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * QQQ: The previous code, mainly fast handling of access and * modify bits aborts, could be moved to ASM. Now we are * starting to deal with not fast aborts. */ PMAP_UNLOCK(pmap); return (KERN_FAILURE); } #if defined(PMAP_DEBUG) /* * Reusing of KVA used in pmap_zero_page function !!! */ static void pmap_zero_page_check(vm_page_t m) { pt2_entry_t *cmap2_pte2p; uint32_t *p, *end; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap2_pte2p = pc->pc_cmap2_pte2p; mtx_lock(&pc->pc_cmap_lock); if (pte2_load(cmap2_pte2p) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(cmap2_pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); end = (uint32_t*)(pc->pc_cmap2_addr + PAGE_SIZE); for (p = (uint32_t*)pc->pc_cmap2_addr; p < end; p++) if (*p != 0) panic("%s: page %p not zero, va: %p", __func__, m, pc->pc_cmap2_addr); pte2_clear(cmap2_pte2p); tlb_flush((vm_offset_t)pc->pc_cmap2_addr); sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte2 = 0; int i, j, index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid || p->p_vmspace == NULL) continue; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPTE1_IN_PT1; i++) { pt1_entry_t pte1; pt2_entry_t *pte2p, pte2; vm_offset_t base, va; vm_paddr_t pa; vm_page_t m; base = i << PTE1_SHIFT; pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1_is_section(pte1)) { /* * QQQ: Do something here! */ } else if (pte1_is_link(pte1)) { for (j = 0; j < NPTE2_IN_PT2; j++) { va = base + (j << PAGE_SHIFT); if (va >= VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte2); } pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (!pte2_is_valid(pte2)) continue; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pa: 0x%x, w: %d, " "f: 0x%x", va, pa, m->wire_count, m->flags); npte2++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } sx_sunlock(&allproc_lock); return (npte2); } #endif #ifdef DDB static pt2_entry_t * pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (!pte1_is_link(pte1)) return (NULL); if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR3); } #ifdef SMP else if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR3); } #endif return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } static void dump_pmap(pmap_t pmap) { printf("pmap %p\n", pmap); printf(" pm_pt1: %p\n", pmap->pm_pt1); printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); } DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) { pmap_t pmap; LIST_FOREACH(pmap, &allpmaps, pm_list) { dump_pmap(pmap); } } static int pte2_class(pt2_entry_t pte2) { int cls; cls = (pte2 >> 2) & 0x03; cls |= (pte2 >> 4) & 0x04; return (cls); } static void dump_section(pmap_t pmap, uint32_t pte1_idx) { } static void dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) { uint32_t i; vm_offset_t va; pt2_entry_t *pte2p, pte2; vm_page_t m; va = pte1_idx << PTE1_SHIFT; pte2p = pmap_pte2_ddb(pmap, va); for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (pte2 == 0) continue; if (!pte2_is_valid(pte2)) { printf(" 0x%08X: 0x%08X", va, pte2); if (!invalid_ok) printf(" - not valid !!!"); printf("\n"); continue; } m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); if (m != NULL) { printf(" v:%d w:%d f:0x%04X\n", m->valid, m->wire_count, m->flags); } else { printf("\n"); } } } static __inline boolean_t is_pv_chunk_space(vm_offset_t va) { if ((((vm_offset_t)pv_chunkbase) <= va) && (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) return (TRUE); return (FALSE); } DB_SHOW_COMMAND(pmap, pmap_pmap_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va, eva; vm_page_t m; uint32_t i; boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; if (have_addr) { pmap_t pm; LIST_FOREACH(pm, &allpmaps, pm_list) if (pm == pmap) break; if (pm == NULL) { printf("given pmap %p is not in allpmaps list\n", pmap); return; } } else pmap = PCPU_GET(curpmap); eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ printf("pmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); for(i = 0; i < NPTE1_IN_PT1; i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (va >= eva) break; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { dump_link_ok = TRUE; invalid_ok = FALSE; pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", va, pte1, pte2, m); if (is_pv_chunk_space(va)) { printf(" - pv_chunk space"); if (dump_pv_chunk) invalid_ok = TRUE; else dump_link_ok = FALSE; } else if (m != NULL) printf(" w:%d w2:%u", m->wire_count, pt2_wirecount_get(m, pte1_index(va))); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO"); else if (pte2_pa(pte1) != pte2_pa(pte2)) printf(" !!! pt2tab entry is DIFFERENT - m: %p", PHYS_TO_VM_PAGE(pte2_pa(pte2))); printf("\n"); if (dump_link_ok) dump_link(pmap, i, invalid_ok); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } } static void dump_pt2tab(pmap_t pmap) { uint32_t i; pt2_entry_t pte2; vm_offset_t va; vm_paddr_t pa; vm_page_t m; printf("PT2TAB:\n"); for (i = 0; i < PT2TAB_ENTRIES; i++) { pte2 = pte2_load(&pmap->pm_pt2tab[i]); if (!pte2_is_valid(pte2)) continue; va = i << PT2TAB_SHIFT; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, pte2_class(pte2), !!(pte2 & PTE2_S), m); if (m != NULL) printf(" , w: %d, f: 0x%04X pidx: %lld", m->wire_count, m->flags, m->pindex); printf("\n"); } } DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va; uint32_t i, start; if (have_addr) { printf("supported only on current pmap\n"); return; } pmap = PCPU_GET(curpmap); printf("curpmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); start = pte1_index((vm_offset_t)PT2MAP); for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, !!(pte1 & PTE1_S)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, pte1, pte2); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO\n"); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } dump_pt2tab(pmap); } #endif Index: head/sys/arm64/arm64/pmap.c =================================================================== --- head/sys/arm64/arm64/pmap.c (revision 352406) +++ head/sys/arm64/arm64/pmap.c (revision 352407) @@ -1,5941 +1,5941 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * The presence of this flag indicates that the mapping is writeable. * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is * dirty. This flag may only be set on managed mappings. * * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it * as a software managed bit. */ #define ATTR_SW_DBM ATTR_DBM struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ #define PMAP_PREINIT_MAPPING_COUNT 32 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ static int vm_initialized = 0; /* No need to use pre-init maps when set */ /* * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. * Always map entire L2 block for simplicity. * VA of L2 block = preinit_map_va + i * L2_SIZE */ static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t size; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; /* * Data for the pv entry allocation mechanism. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) static vm_paddr_t physmap[PHYSMAP_SIZE]; static u_int physmap_idx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) #define pmap_load(table) (*table) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set_bits(table, bits) atomic_set_64(table, bits) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) return (false); *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); /* * Checks if the PTE is dirty. */ static inline int pmap_pte_dirty(pt_entry_t pte) { KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); KASSERT((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != 0, ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); return ((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static vm_offset_t pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_offset_t freemempos) { pt_entry_t *l2; vm_offset_t va; vm_paddr_t l2_pa, pa; u_int l1_slot, l2_slot, prev_l1_slot; int i; dmap_phys_base = min_pa & ~L1_OFFSET; dmap_phys_max = 0; dmap_max_addr = 0; l2 = NULL; prev_l1_slot = -1; #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); for (i = 0; i < (physmap_idx * 2); i += 2) { pa = physmap[i] & ~L2_OFFSET; va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; /* Create L2 mappings at the start of the region */ if ((pa & L1_OFFSET) != 0) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { /* * We are on a boundary, stop to * create a level 1 block */ if ((pa & L1_OFFSET) == 0) break; l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), ("...")); } for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && (physmap[i + 1] - pa) >= L1_SIZE; pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Create L2 mappings at the end of the region */ if (pa < physmap[i + 1]) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } } if (pa > dmap_phys_max) { dmap_phys_max = pa; dmap_max_addr = va; } } cpu_tlb_flushID(); return (freemempos); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; uint64_t kern_delta; int i; kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; physmap_idx = arm_physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; } freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create a direct map region early so we can use it for pa -> va */ freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); va = KERNBASE; start_pa = pa = KERNBASE - kern_delta; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; /* Reserve some VA space for early BIOS/ACPI mapping */ preinit_map_va = roundup2(freemempos, L2_SIZE); virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; virtual_avail = roundup2(virtual_avail, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L2_SIZE; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); vm_initialized = 1; } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } static __inline void pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { sched_pin(); pmap_invalidate_range_nopin(pmap, sva, eva); sched_unpin(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. */ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_page_t m; int lvl; m = NULL; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || ((prot & VM_PROT_WRITE) == 0)) { switch(lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); pte = pmap_l1(kernel_pmap, va); if (pte == NULL) return (0); /* * A concurrent pmap_update_entry() will clear the entry's valid bit * but leave the rest of the entry unchanged. Therefore, we treat a * non-zero entry as being valid, and we ignore the valid bit when * determining whether the entry maps a block, page, or table. */ tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L1_OFFSET)); pte = pmap_l1_to_l2(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L2_OFFSET)); pte = pmap_l2_to_l3(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); return ((tpte & ~ATTR_MASK) | (va & L3_OFFSET)); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte, attr; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE; if (mode == DEVICE_MEMORY) attr |= ATTR_XN; va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, DEVICE_MEMORY); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; if (m->md.pv_memattr == DEVICE_MEMORY) pa |= ATTR_XN; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0 = kernel_pmap->pm_l0; pmap->pm_root.rt_root = 0; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l0phys; vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l0phys = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Because of AArch64's weak memory consistency model, we must have a * barrier here to ensure that the stores for zeroing "m", whether by * pmap_zero_page() or an earlier function, are visible before adding * "m" to the page table. Otherwise, a page table walk by another * processor's MMU could see the mapping to "m" and a stale, non-zero * PTE within "m". */ dmb(ishst); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->wire_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->wire_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); l2pg->wire_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->wire_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0)); vm_page_unwire_noq(m); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l1, paddr | L1_TABLE); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if (pmap_load(l2) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l2, paddr | L2_TABLE); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed, lvl; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pv_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va, &lvl); if (lvl != 2) continue; pte = pmap_l2_to_l3(pde, va); tpte = pmap_load(pte); if ((tpte & ATTR_SW_WIRED) != 0) continue; tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, pmap_load(pde), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_demote_l2: va is not 2mpage aligned")); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = l2e & ~ATTR_MASK; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | L2_TABLE; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. The caller must have already invalidated the * mapping (i.e., the "break" in break-before-make). */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); old_l2 = pmap_load_clear(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); if (old_l2 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if (old_l2 & ATTR_SW_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); va < eva; va += PAGE_SIZE, m++) { if (pmap_pte_dirty(old_l2)) vm_page_dirty(m); if (old_l2 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_l2: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the specified range of addresses from the L3 page table that is * identified by the given L2 entry. */ static void pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, vm_offset_t eva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; struct rwlock *new_lock; pt_entry_t *l3, old_l3; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), ("pmap_remove_l3_range: range crosses an L3 page table boundary")); va = eva; for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { if (!pmap_l3_valid(pmap_load(l3))) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } old_l3 = pmap_load_clear(l3); if ((old_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; pmap_resident_count_dec(pmap, 1); if ((old_l3 & ATTR_SW_MANAGED) != 0) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if ((old_l3 & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); if (new_lock != *lockp) { if (*lockp != NULL) { /* * Pending TLB invalidations must be * performed before the PV list lock is * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } rw_wunlock(*lockp); } *lockp = new_lock; rw_wlock(*lockp); } pmap_pvh_free(&m->md, pmap, sva); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (va == eva) va = sva; if (pmap_unuse_pt(pmap, sva, l2e, free)) { sva += L3_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, &lock); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, pv->pv_va); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_pte_dirty(tpte)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_protect_l2: sva is not 2mpage aligned")); old_l2 = pmap_load(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); /* * Return if the L2 entry already has the desired access restrictions * in place. */ retry: if ((old_l2 & mask) == nbits) return; /* * When a dirty read/write superpage mapping is write protected, * update the dirty field of each of the superpage's constituent 4KB * pages. */ if ((old_l2 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) goto retry; /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, mask, nbits; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } mask = nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_AP(ATTR_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { mask |= ATTR_XN; nbits |= ATTR_XN; } if (mask == 0) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); retry: /* * Go to the next L3 entry if the current one is * invalid or already has the desired access * restrictions in place. (The latter case occurs * frequently. For example, in a "buildworld" * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) */ if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } /* * When a dirty read/write mapping is write protected, * update the page's dirty field. */ if ((l3 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) goto retry; if (va == va_next) va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); critical_enter(); /* * Clear the old mapping's valid bit, but leave the rest of the entry * unchanged, so that a lockless, concurrent pmap_kextract() can still * lookup the physical address. */ pmap_clear_bits(pte, ATTR_DESCR_VALID); pmap_invalidate_range_nopin(pmap, va, va + size); /* Create the new mapping */ pmap_store(pte, newpte); dsb(ishst); critical_exit(); intr_restore(intr); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); setl2: if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l2, &newl2, newl2 & ~ATTR_SW_DBM)) goto setl2; newl2 &= ~ATTR_SW_DBM; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); setl3: if ((oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & ~ATTR_SW_DBM)) goto setl3; oldl3 &= ~ATTR_SW_DBM; } if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; int lvl, rv; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l3 |= ATTR_XN; if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if (va < VM_MAXUSER_ADDRESS) new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; if ((prot & VM_PROT_WRITE) != 0) { new_l3 |= ATTR_SW_DBM; if ((flags & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); } } CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } /* We need to allocate an L3 table. */ } if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; /* * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order * to handle the possibility that a superpage mapping for "va" * was created while we slept. */ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: missing L3 table for kernel va %#lx", va); havel3: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; pv = NULL; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & ATTR_SW_MANAGED) != 0 && (new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. */ orig_l3 = pmap_load_clear(l3); KASSERT((orig_l3 & ~ATTR_MASK) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if (pmap_pte_dirty(orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync icache if exec permission and attribute VM_MEMATTR_WRITE_BACK * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, then other can * access this page before caches are properly synced. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && (opa != pa || (orig_l3 & ATTR_XN))) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); /* * Update the L3 entry */ if (pmap_l3_valid(orig_l3)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ /* XXXMJ need to reload orig_l3 for hardware DBM. */ pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if ((orig_l3 & ATTR_SW_MANAGED) != 0 && pmap_pte_dirty(orig_l3)) vm_page_dirty(m); } else { /* * orig_l3 == new_l3 * This can happens if multiple threads simultaneously * access not yet mapped page. This bad for performance * since this can cause full demotion-NOP-promotion * cycle. * Another possible reasons are: * - VM and pmap memory layout are diverged * - tlb flush is missing somewhere and CPU doesn't see * actual mapping. */ CTR4(KTR_PMAP, "%s: already mapped page - " "pmap %p va 0x%#lx pte 0x%lx", __func__, pmap, va, new_l3); } } else { /* New mapping */ pmap_store(l3, new_l3); dsb(ishst); } #if VM_NRESERVLEVEL > 0 if ((mpte == NULL || mpte->wire_count == NL3PG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; PMAP_LOCK_ASSERT(pmap, MA_OWNED); new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L2_BLOCK); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; } if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l2 |= ATTR_XN; if (va < VM_MAXUSER_ADDRESS) new_l2 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, old_l2; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((old_l2 = pmap_load(l2)) != 0) { KASSERT(l2pg->wire_count > 1, ("pmap_enter_l2: l2pg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3_range() * will leave the kernel page table page zero filled. * Nonetheless, the TLB could have an intermediate * entry for the kernel page table page. */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); pmap_clear(l2); pmap_invalidate_page(pmap, va); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & ATTR_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, the TLB could * nonetheless have intermediate entries that * refer to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation (See * _pmap_unwire_l3().) */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & ATTR_SW_DBM) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); dsb(ishst); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pd_entry_t *pde; pt_entry_t *l2, *l3, l3_val; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } /* * Abort if a mapping already exists. */ if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) l3_val |= ATTR_XN; else if (va < VM_MAXUSER_ADDRESS) l3_val |= ATTR_PXN; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) { l3_val |= ATTR_SW_MANAGED; l3_val &= ~ATTR_AF; } /* Sync icache before the mapping is stored to PTE */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); pmap_store(l3, l3_val); dsb(ishst); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_clear_bits(l2, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) panic("pmap_unwire: demotion failed"); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * ATTR_SW_WIRED must be cleared atomically. Although * the pmap lock synchronizes access to ATTR_SW_WIRED, * the System MMU may write to the entry concurrently. */ pmap_clear_bits(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. * * Because the executable mappings created by this routine are copied, * it should not have to flush the instruction cache. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pd_entry_t *l0, *l1, *l2, srcptepaddr; pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_l2pg, dstmpte, srcmpte; if (dst_addr != src_addr) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { l0 = pmap_l0(src_pmap, addr); if (pmap_load(l0) == 0) { va_next = (addr + L0_SIZE) & ~L0_OFFSET; if (va_next < addr) va_next = end_addr; continue; } l1 = pmap_l0_to_l1(l0, addr); if (pmap_load(l1) == 0) { va_next = (addr + L1_SIZE) & ~L1_OFFSET; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L2_SIZE) & ~L2_OFFSET; if (va_next < addr) va_next = end_addr; l2 = pmap_l1_to_l2(l1, addr); srcptepaddr = pmap_load(l2); if (srcptepaddr == 0) continue; if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { if ((addr & L2_OFFSET) != 0 || addr + L2_SIZE > end_addr) continue; dst_l2pg = pmap_alloc_l2(dst_pmap, addr, NULL); if (dst_l2pg == NULL) break; l2 = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_l2pg)); l2 = &l2[pmap_l2_index(addr)]; if (pmap_load(l2) == 0 && ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((srcptepaddr & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(l2, (srcptepaddr & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else dst_l2pg->wire_count--; continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_l3_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = pmap_load(src_pte); /* * We only virtual copy managed pages. */ if ((ptetemp & ATTR_SW_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_l3_index(addr)]; if (pmap_load(dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((ptetemp & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_l3(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * the TLB could nonetheless have * intermediate entries that refer * to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: /* * XXX This barrier may not be needed because the destination pmap is * not active. */ dsb(ishst); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); tpte = pmap_load(pte); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (pmap_pte_dirty(tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) - if (vm_page_aflags(mt) & PGA_WRITEABLE) != 0 && + if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - if (vm_page_aflags(m) & PGA_WRITEABLE) != 0 && + if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); oldpte = pmap_load(pte); retry: if ((oldpte & ATTR_SW_DBM) != 0) { if (!atomic_fcmpset_long(pte, &oldpte, (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM)) goto retry; if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; vm_paddr_t pa; int cleared, lvl, md_gen, not_cleared, pvh_gen; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) { /* * Although "tpte" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual superpage number, and the pmap * address to select one 4KB page out of the 512 on * which testing the reference bit will result in * clearing that reference bit. This function is * designed to avoid the selection of the same 4KB page * for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { if ((tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; vm_offset_t va, va_next; vm_page_t m; pd_entry_t *l0, *l1, *l2, oldl2; pt_entry_t *l3, oldl3; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); oldl2 = pmap_load(l2); if (oldl2 == 0) continue; if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { if ((oldl2 & ATTR_SW_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The 2MB page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldl2 & ATTR_SW_WIRED) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); l3 = pmap_l2_to_l3(l2, va); KASSERT(pmap_load(l3) != 0, ("pmap_advise: invalid PTE")); pmap_remove_l3(pmap, l3, va, pmap_load(l2), NULL, &lock); } if (lock != NULL) rw_wunlock(lock); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_advise: invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { oldl3 = pmap_load(l3); if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != (ATTR_SW_MANAGED | L3_PAGE)) goto maybe_invlrng; else if (pmap_pte_dirty(oldl3)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); vm_page_dirty(m); } while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_AF) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); } else if ((oldl3 & ATTR_AF) != 0) pmap_clear_bits(l3, ATTR_AF); else goto maybe_invlrng; if (va == va_next) va = sva; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3, oldl3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM * set. If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ if ((oldl2 & ATTR_SW_DBM) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & ATTR_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); l3 = pmap_l2_to_l3(l2, va); oldl3 = pmap_load(l3); while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_SW_DBM) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); if (pmap_l3_valid(oldl3) && (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { pmap_set_bits(l3, ATTR_AP(ATTR_AP_RO)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, free_l2_count, start_idx; if (!vm_initialized) { /* * No L3 ptables so map entire L2 blocks where start VA is: * preinit_map_va + start_idx * L2_SIZE * There may be duplicate mappings (multiple VA -> same PA) but * ARM64 dcache is always PIPT so that's acceptable. */ if (size == 0) return (NULL); /* Calculate how many L2 blocks are needed for the mapping */ l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT; offset = pa & L2_OFFSET; if (preinit_map_va == 0) return (NULL); /* Map 2MiB L2 blocks from reserved VA space */ free_l2_count = 0; start_idx = -1; /* Find enough free contiguous VA space */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (free_l2_count > 0 && ppim->pa != 0) { /* Not enough space here */ free_l2_count = 0; start_idx = -1; continue; } if (ppim->pa == 0) { /* Free L2 block */ if (start_idx == -1) start_idx = i; free_l2_count++; if (free_l2_count == l2_blocks) break; } } if (free_l2_count != l2_blocks) panic("%s: too many preinit mappings", __func__); va = preinit_map_va + (start_idx * L2_SIZE); for (i = start_idx; i < start_idx + l2_blocks; i++) { /* Mark entries as allocated */ ppim = pmap_preinit_mapping + i; ppim->pa = pa; ppim->va = va + offset; ppim->size = size; } /* Map L2 blocks */ pa = rounddown2(pa, L2_SIZE); for (i = 0; i < l2_blocks; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_mapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl)); /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, pa | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; } pmap_invalidate_all(kernel_pmap); va = preinit_map_va + (start_idx * L2_SIZE); } else { /* kva_alloc may be used to map the pages */ offset = pa & PAGE_MASK; size = round_page(offset + size); va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); /* L3 table is linked */ va = trunc_page(va); pa = trunc_page(pa); pmap_kenter(va, size, pa, CACHED_MEMORY); } return ((void *)(va + offset)); } void pmap_unmapbios(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset, tmpsize, va_trunc; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, block; bool preinit_map; l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); /* Remove preinit mapping */ preinit_map = false; block = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va) { KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch")); ppim->va = 0; ppim->pa = 0; ppim->size = 0; preinit_map = true; offset = block * L2_SIZE; va_trunc = rounddown2(va, L2_SIZE) + offset; /* Remove L2_BLOCK */ pde = pmap_pde(kernel_pmap, va_trunc, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc)); l2 = pmap_l1_to_l2(pde, va_trunc); pmap_clear(l2); if (block == (l2_blocks - 1)) break; block++; } } if (preinit_map) { pmap_invalidate_all(kernel_pmap); return; } /* Unmap the pages reserved with kva_alloc. */ if (vm_initialized) { offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); /* Unmap and invalidate the pages */ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kremove(va + tmpsize); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, tmpva, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_IDX_MASK; l3 |= ATTR_IDX(mode); if (mode == DEVICE_MEMORY) l3 |= ATTR_XN; pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. */ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } static void pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) { pt_entry_t *l3; for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { *l3 = newl3; newl3 += L3_SIZE; } } static void pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct rwlock **lockp) { struct spglist free; SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); va &= ~L2_OFFSET; tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldl2 & ATTR_AF) == 0) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", va, pmap); goto fail; } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), ("pmap_demote_l2: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (ml3 == NULL) { pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) { ml3->wire_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; KASSERT((oldl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM), ("pmap_demote_l2: L2 entry is writeable but not dirty")); /* * If the page table page is not leftover from an earlier promotion, * or the mapping attributes have changed, (re)initialize the L3 table. * * When pmap_update_entry() clears the old L2 mapping, it (indirectly) * performs a dsb(). That dsb() ensures that the stores for filling * "l3" are visible before "l3" is added to the page table. */ if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) pmap_fill_l3(l3, newl3); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Pass PAGE_SIZE so that a single TLB invalidation is performed on * the 2MB page mapping. */ pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; int lvl, val; bool managed; PMAP_LOCK(pmap); retry: val = 0; pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL) { tpte = pmap_load(pte); switch (lvl) { case 3: mask = L3_OFFSET; break; case 2: mask = L2_OFFSET; break; case 1: mask = L1_OFFSET; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; val = MINCORE_INCORE; if (lvl != 3) val |= MINCORE_SUPER; if ((managed && pmap_pte_dirty(tpte)) || (!managed && (tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; pa = (tpte & ~ATTR_MASK) | (addr & mask); } else managed = false; if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0); __asm __volatile( "msr ttbr0_el1, %0 \n" "isb \n" : : "r"(td->td_proc->p_md.md_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } struct pcb * pmap_switch(struct thread *old, struct thread *new) { pcpu_bp_harden bp_harden; struct pcb *pcb; /* Store the new curthread */ PCPU_SET(curthread, new); /* And the new pcb */ pcb = new->td_pcb; PCPU_SET(curpcb, pcb); /* * TODO: We may need to flush the cache here if switching * to a user process. */ if (old == NULL || old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) { __asm __volatile( /* Switch to the new pmap */ "msr ttbr0_el1, %0 \n" "isb \n" /* Invalidate the TLB */ "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n" : : "r"(new->td_proc->p_md.md_l0addr)); /* * Stop userspace from training the branch predictor against * other processes. This will call into a CPU specific * function that clears the branch predictor state. */ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } return (pcb); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pt_entry_t pte, *ptep; register_t intr; uint64_t ec, par; int lvl, rv; rv = KERN_FAILURE; ec = ESR_ELx_EXCEPTION(esr); switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (rv); } /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL) { pmap_set_bits(ptep, ATTR_AF); rv = KERN_SUCCESS; /* * XXXMJ as an optimization we could mark the entry * dirty if this is a write fault. */ } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || (esr & ISS_DATA_WnR) == 0) return (rv); PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL && ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { if ((pte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RO)) { pmap_clear_bits(ptep, ATTR_AP_RW_BIT); pmap_invalidate_page(pmap, far); } rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: PMAP_LOCK(pmap); /* Ask the MMU to check the address */ intr = intr_disable(); if (pmap == kernel_pmap) par = arm64_address_translate_s1e1r(far); else par = arm64_address_translate_s1e0r(far); intr_restore(intr); PMAP_UNLOCK(pmap); /* * If the translation was successful the address was invalid * due to a break-before-make sequence. We can unlock and * return success to the trap handler. */ if (PAR_SUCCESS(par)) rv = KERN_SUCCESS; break; } return (rv); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 352406) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 352407) @@ -1,2701 +1,2705 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); /* * Tunable to control percentage of dirtied blocks from frees in one TXG. * After this threshold is crossed, additional dirty blocks from frees * wait until the next TXG. * A value of zero will disable this throttle. */ uint32_t zfs_per_txg_dirty_frees_percent = 30; SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); /* * This can be used for testing, to ensure that certain actions happen * while in the middle of a remap (which might otherwise complete too * quickly). */ int zfs_object_remap_one_indirect_delay_ticks = 0; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); ASSERT(db != NULL); err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else dbuf_rele(db, tag); return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); #if defined(_KERNEL) && defined(RACCT) if (racct_enable && !read) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_WRITEBPS, length); racct_add_force(curproc, RACCT_WRITEIOPS, nblks); PROC_UNLOCK(curproc); } #endif zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); #ifdef _KERNEL else curthread->td_ru.ru_oublock++; #endif dbp[i] = &db->db; } if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, read && DNODE_IS_CACHEABLE(dn)); } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefeteched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not in * cache, they will be asychronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) { uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); if (*start - minimum <= iblkrange * maxblks) { *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. * Used below in dmu_free_long_range_impl() to enable abort when unmounting */ /*ARGSUSED*/ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 4; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t long_free_dirty_all_txgs = 0; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; mutex_enter(&dp->dp_lock); for (int t = 0; t < TXG_SIZE; t++) { long_free_dirty_all_txgs += dp->dp_long_free_dirty_pertxg[t]; } mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees */ if (dirty_frees_threshold != 0 && long_free_dirty_all_txgs >= dirty_frees_threshold) { txg_wait_open(dp, 0); continue; } tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += chunk_len; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, uint64_t, dmu_tx_get_txg(tx)); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); bcopy((char *)db->db_data + bufoff, buf, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); bcopy(buf, (char *)db->db_data + bufoff, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } static int dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, uint64_t last_removal_txg, uint64_t offset) { uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); int err = 0; rw_enter(&dn->dn_struct_rwlock, RW_READER); dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); ASSERT3P(dbuf, !=, NULL); /* * If the block hasn't been written yet, this default will ensure * we don't try to remap it. */ uint64_t birth = UINT64_MAX; ASSERT3U(last_removal_txg, !=, UINT64_MAX); if (dbuf->db_blkptr != NULL) birth = dbuf->db_blkptr->blk_birth; rw_exit(&dn->dn_struct_rwlock); /* * If this L1 was already written after the last removal, then we've * already tried to remap it. */ if (birth <= last_removal_txg && dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && dbuf_can_remap(dbuf)) { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { (void) dbuf_dirty(dbuf, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } } dbuf_rele(dbuf, FTAG); delay(zfs_object_remap_one_indirect_delay_ticks); return (err); } /* * Remap all blockpointers in the object, if possible, so that they reference * only concrete vdevs. * * To do this, iterate over the L0 blockpointers and remap any that reference * an indirect vdev. Note that we only examine L0 blockpointers; since we * cannot guarantee that we can remap all blockpointer anyways (due to split * blocks), we do not want to make the code unnecessarily complicated to * catch the unlikely case that there is an L1 block on an indirect vdev that * contains no indirect blockpointers. */ int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t last_removal_txg) { uint64_t offset, l1span; int err; dnode_t *dn; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) { return (err); } if (dn->dn_nlevels <= 1) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); } /* * If the dnode has no indirect blocks, we cannot dirty them. * We still want to remap the blkptr(s) in the dnode if * appropriate, so mark it as dirty. */ if (err == 0 && dnode_needs_remap(dn)) { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, dn->dn_object); if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) { dnode_setdirty(dn, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } } dnode_rele(dn, FTAG); return (err); } offset = 0; l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT + dn->dn_datablkshift); /* * Find the next L1 indirect that is not a hole. */ while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); break; } if ((err = dmu_object_remap_one_indirect(os, dn, last_removal_txg, offset)) != 0) { break; } offset += l1span; } dnode_rele(dn, FTAG); return (err); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } /* * DMU support for xuio */ kstat_t *xuio_ksp = NULL; int dmu_xuio_init(xuio_t *xuio, int nblk) { dmu_xuio_t *priv; uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); priv->cnt = nblk; priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); priv->iovp = uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); return (0); } void dmu_xuio_fini(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int nblk = priv->cnt; kmem_free(priv->iovp, nblk * sizeof (iovec_t)); kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); } /* * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { struct iovec *iov; uio_t *uio = &xuio->xu_uio; dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int i = priv->next++; ASSERT(i < priv->cnt); ASSERT(off + n <= arc_buf_lsize(abuf)); iov = uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; priv->bufs[i] = abuf; return (0); } int dmu_xuio_cnt(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); return (priv->cnt); } arc_buf_t * dmu_xuio_arcbuf(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); return (priv->bufs[i]); } void dmu_xuio_clear(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); priv->bufs[i] = NULL; } static void xuio_stat_init(void) { xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (xuio_ksp != NULL) { xuio_ksp->ks_data = &xuio_stats; kstat_install(xuio_ksp); } } static void xuio_stat_fini(void) { if (xuio_ksp != NULL) { kstat_delete(xuio_ksp); xuio_ksp = NULL; } } void xuio_stat_wbuf_copied(void) { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void xuio_stat_wbuf_nocopy(void) { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); #ifdef UIO_XUIO if (uio->uio_extflg == UIO_XUIO) xuio = (xuio_t *)uio; #endif for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); if (xuio) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); if (!err) { uio->uio_resid -= tocpy; uio->uio_loffset += tocpy; } if (abuf == dbuf_abuf) XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { #ifdef illumos err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); #else err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #endif } if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset uio->uio_loffset. */ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); #ifdef illumos /* * XXX uiomove could block forever (eg. nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); #else err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #endif if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset uio->uio_loffset. */ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #else /* !illumos */ int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, vm_page_t *ma, dmu_tx_t *tx) { dmu_buf_t **dbp; struct sf_buf *sf; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(*ma, &sf); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(sf); ma += 1; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, int last_size) { struct sf_buf *sf; vm_object_t vmobj; vm_page_t m; dmu_buf_t **dbp; dmu_buf_t *db; caddr_t va; int numbufs, i; int bufoff, pgoff, tocpy; int mi, di; int err; ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); ASSERT(last_size <= PAGE_SIZE); err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); if (err != 0) return (err); #ifdef DEBUG IMPLY(last_size < PAGE_SIZE, *rahead == 0); if (dbp[0]->db_offset != 0 || numbufs > 1) { for (i = 0; i < numbufs; i++) { ASSERT(ISP2(dbp[i]->db_size)); ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0); ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); } } #endif vmobj = ma[0]->object; zfs_vmobject_wlock(vmobj); db = dbp[0]; for (i = 0; i < *rbehind; i++) { m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY); if (m == NULL) break; if (m->valid != 0) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); break; } ASSERT(m->dirty == 0); ASSERT(!pmap_page_is_mapped(m)); ASSERT(db->db_size > PAGE_SIZE); bufoff = IDX_TO_OFF(m->pindex) % db->db_size; va = zfs_map_page(m, &sf); bcopy((char *)db->db_data + bufoff, va, PAGESIZE); zfs_unmap_page(sf); m->valid = VM_PAGE_BITS_ALL; + vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); + vm_page_unlock(m); } *rbehind = i; bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; pgoff = 0; for (mi = 0, di = 0; mi < count && di < numbufs; ) { if (pgoff == 0) { m = ma[mi]; if (m != bogus_page) { vm_page_assert_xbusied(m); ASSERT(m->valid == 0); ASSERT(m->dirty == 0); ASSERT(!pmap_page_is_mapped(m)); va = zfs_map_page(m, &sf); } } if (bufoff == 0) db = dbp[di]; if (m != bogus_page) { ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, db->db_offset + bufoff); } /* * We do not need to clamp the copy size by the file * size as the last block is zero-filled beyond the * end of file anyway. */ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); if (m != bogus_page) bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); pgoff += tocpy; ASSERT(pgoff <= PAGESIZE); if (pgoff == PAGESIZE) { if (m != bogus_page) { zfs_unmap_page(sf); m->valid = VM_PAGE_BITS_ALL; } ASSERT(mi < count); mi++; pgoff = 0; } bufoff += tocpy; ASSERT(bufoff <= db->db_size); if (bufoff == db->db_size) { ASSERT(di < numbufs); di++; bufoff = 0; } } #ifdef DEBUG /* * Three possibilities: * - last requested page ends at a buffer boundary and , thus, * all pages and buffers have been iterated; * - all requested pages are filled, but the last buffer * has not been exhausted; * the read-ahead is possible only in this case; * - all buffers have been read, but the last page has not been * fully filled; * this is only possible if the file has only a single buffer * with a size that is not a multiple of the page size. */ if (mi == count) { ASSERT(di >= numbufs - 1); IMPLY(*rahead != 0, di == numbufs - 1); IMPLY(*rahead != 0, bufoff != 0); ASSERT(pgoff == 0); } if (di == numbufs) { ASSERT(mi >= count - 1); ASSERT(*rahead == 0); IMPLY(pgoff == 0, mi == count); if (pgoff != 0) { ASSERT(mi == count - 1); ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); } } #endif if (pgoff != 0) { ASSERT(m != bogus_page); bzero(va + pgoff, PAGESIZE - pgoff); zfs_unmap_page(sf); m->valid = VM_PAGE_BITS_ALL; } for (i = 0; i < *rahead; i++) { m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i, VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY); if (m == NULL) break; if (m->valid != 0) { ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); break; } ASSERT(m->dirty == 0); ASSERT(!pmap_page_is_mapped(m)); ASSERT(db->db_size > PAGE_SIZE); bufoff = IDX_TO_OFF(m->pindex) % db->db_size; tocpy = MIN(db->db_size - bufoff, PAGESIZE); va = zfs_map_page(m, &sf); bcopy((char *)db->db_data + bufoff, va, tocpy); if (tocpy < PAGESIZE) { ASSERT(i == *rahead - 1); ASSERT((db->db_size & PAGE_MASK) != 0); bzero(va + tocpy, PAGESIZE - tocpy); } zfs_unmap_page(sf); m->valid = VM_PAGE_BITS_ALL; + vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); + vm_page_unlock(m); } *rahead = i; zfs_vmobject_wunlock(vmobj); dmu_buf_rele_array(dbp, numbufs, FTAG); return (0); } #endif /* illumos */ #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ void dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned, the arc buf is the * same size as the dbuf, and the dbuf is not metadata. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { #ifdef _KERNEL curthread->td_ru.ru_oublock++; #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_WRITEBPS, blksz); racct_add_force(curproc, RACCT_WRITEIOPS, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ #endif /* _KERNEL */ dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { objset_t *os; uint64_t object; /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); os = dn->dn_objset; object = dn->dn_object; dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. */ if (zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); ASSERT(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; blkptr_t *bp_orig = &zio->io_bp_orig; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. */ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. * * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; while (dr && dr->dr_txg != txg) dr = dr->dr_next; if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } int zfs_mdcomp_disable = 0; SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { if (zfs_mdcomp_disable) { compress = ZIO_COMPRESS_EMPTY; } else { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); } /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_NOPARITY; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checkum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; zp->zp_compress = compress; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; } int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int err; /* * Sync any current changes before * we go trundling through the block pointers. */ err = dmu_object_wait_synced(os, object); if (err) { return (err); } err = dnode_hold(os, object, FTAG, &dn); if (err) { return (err); } err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); } /* * Given the ZFS object, if it contains any dirty nodes * this function flushes all dirty blocks to disk. This * ensures the DMU object info is updated. A more efficient * future version might just find the TXG with the maximum * ID and wait for that to be synced. */ int dmu_object_wait_synced(objset_t *os, uint64_t object) { dnode_t *dn; int error, i; error = dnode_hold(os, object, FTAG, &dn); if (error) { return (error); } for (i = 0; i < TXG_SIZE; i++) { if (list_link_active(&dn->dn_dirty_link[i])) { break; } } dnode_rele(dn, FTAG); if (i != TXG_SIZE) { txg_wait_synced(dmu_objset_pool(os), 0); } return (0); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. * This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); zio_compress_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); zfetch_fini(); zio_compress_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } Index: head/sys/dev/virtio/balloon/virtio_balloon.c =================================================================== --- head/sys/dev/virtio/balloon/virtio_balloon.c (revision 352406) +++ head/sys/dev/virtio/balloon/virtio_balloon.c (revision 352407) @@ -1,562 +1,564 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Driver for VirtIO memory balloon devices. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "virtio_if.h" struct vtballoon_softc { device_t vtballoon_dev; struct mtx vtballoon_mtx; uint64_t vtballoon_features; uint32_t vtballoon_flags; #define VTBALLOON_FLAG_DETACH 0x01 struct virtqueue *vtballoon_inflate_vq; struct virtqueue *vtballoon_deflate_vq; uint32_t vtballoon_desired_npages; uint32_t vtballoon_current_npages; TAILQ_HEAD(,vm_page) vtballoon_pages; struct thread *vtballoon_td; uint32_t *vtballoon_page_frames; int vtballoon_timeout; }; static struct virtio_feature_desc vtballoon_feature_desc[] = { { VIRTIO_BALLOON_F_MUST_TELL_HOST, "MustTellHost" }, { VIRTIO_BALLOON_F_STATS_VQ, "StatsVq" }, { 0, NULL } }; static int vtballoon_probe(device_t); static int vtballoon_attach(device_t); static int vtballoon_detach(device_t); static int vtballoon_config_change(device_t); static void vtballoon_negotiate_features(struct vtballoon_softc *); static int vtballoon_alloc_virtqueues(struct vtballoon_softc *); static void vtballoon_vq_intr(void *); static void vtballoon_inflate(struct vtballoon_softc *, int); static void vtballoon_deflate(struct vtballoon_softc *, int); static void vtballoon_send_page_frames(struct vtballoon_softc *, struct virtqueue *, int); static void vtballoon_pop(struct vtballoon_softc *); static void vtballoon_stop(struct vtballoon_softc *); static vm_page_t vtballoon_alloc_page(struct vtballoon_softc *); static void vtballoon_free_page(struct vtballoon_softc *, vm_page_t); static int vtballoon_sleep(struct vtballoon_softc *); static void vtballoon_thread(void *); static void vtballoon_add_sysctl(struct vtballoon_softc *); /* Features desired/implemented by this driver. */ #define VTBALLOON_FEATURES 0 /* Timeout between retries when the balloon needs inflating. */ #define VTBALLOON_LOWMEM_TIMEOUT hz /* * Maximum number of pages we'll request to inflate or deflate * the balloon in one virtqueue request. Both Linux and NetBSD * have settled on 256, doing up to 1MB at a time. */ #define VTBALLOON_PAGES_PER_REQUEST 256 /* Must be able to fix all pages frames in one page (segment). */ CTASSERT(VTBALLOON_PAGES_PER_REQUEST * sizeof(uint32_t) <= PAGE_SIZE); #define VTBALLOON_MTX(_sc) &(_sc)->vtballoon_mtx #define VTBALLOON_LOCK_INIT(_sc, _name) mtx_init(VTBALLOON_MTX((_sc)), _name, \ "VirtIO Balloon Lock", MTX_DEF) #define VTBALLOON_LOCK(_sc) mtx_lock(VTBALLOON_MTX((_sc))) #define VTBALLOON_UNLOCK(_sc) mtx_unlock(VTBALLOON_MTX((_sc))) #define VTBALLOON_LOCK_DESTROY(_sc) mtx_destroy(VTBALLOON_MTX((_sc))) static device_method_t vtballoon_methods[] = { /* Device methods. */ DEVMETHOD(device_probe, vtballoon_probe), DEVMETHOD(device_attach, vtballoon_attach), DEVMETHOD(device_detach, vtballoon_detach), /* VirtIO methods. */ DEVMETHOD(virtio_config_change, vtballoon_config_change), DEVMETHOD_END }; static driver_t vtballoon_driver = { "vtballoon", vtballoon_methods, sizeof(struct vtballoon_softc) }; static devclass_t vtballoon_devclass; DRIVER_MODULE(virtio_balloon, virtio_pci, vtballoon_driver, vtballoon_devclass, 0, 0); MODULE_VERSION(virtio_balloon, 1); MODULE_DEPEND(virtio_balloon, virtio, 1, 1, 1); VIRTIO_SIMPLE_PNPTABLE(virtio_balloon, VIRTIO_ID_BALLOON, "VirtIO Balloon Adapter"); VIRTIO_SIMPLE_PNPINFO(virtio_pci, virtio_balloon); static int vtballoon_probe(device_t dev) { return (VIRTIO_SIMPLE_PROBE(dev, virtio_balloon)); } static int vtballoon_attach(device_t dev) { struct vtballoon_softc *sc; int error; sc = device_get_softc(dev); sc->vtballoon_dev = dev; VTBALLOON_LOCK_INIT(sc, device_get_nameunit(dev)); TAILQ_INIT(&sc->vtballoon_pages); vtballoon_add_sysctl(sc); virtio_set_feature_desc(dev, vtballoon_feature_desc); vtballoon_negotiate_features(sc); sc->vtballoon_page_frames = malloc(VTBALLOON_PAGES_PER_REQUEST * sizeof(uint32_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtballoon_page_frames == NULL) { error = ENOMEM; device_printf(dev, "cannot allocate page frame request array\n"); goto fail; } error = vtballoon_alloc_virtqueues(sc); if (error) { device_printf(dev, "cannot allocate virtqueues\n"); goto fail; } error = virtio_setup_intr(dev, INTR_TYPE_MISC); if (error) { device_printf(dev, "cannot setup virtqueue interrupts\n"); goto fail; } error = kthread_add(vtballoon_thread, sc, NULL, &sc->vtballoon_td, 0, 0, "virtio_balloon"); if (error) { device_printf(dev, "cannot create balloon kthread\n"); goto fail; } virtqueue_enable_intr(sc->vtballoon_inflate_vq); virtqueue_enable_intr(sc->vtballoon_deflate_vq); fail: if (error) vtballoon_detach(dev); return (error); } static int vtballoon_detach(device_t dev) { struct vtballoon_softc *sc; sc = device_get_softc(dev); if (sc->vtballoon_td != NULL) { VTBALLOON_LOCK(sc); sc->vtballoon_flags |= VTBALLOON_FLAG_DETACH; wakeup_one(sc); msleep(sc->vtballoon_td, VTBALLOON_MTX(sc), 0, "vtbdth", 0); VTBALLOON_UNLOCK(sc); sc->vtballoon_td = NULL; } if (device_is_attached(dev)) { vtballoon_pop(sc); vtballoon_stop(sc); } if (sc->vtballoon_page_frames != NULL) { free(sc->vtballoon_page_frames, M_DEVBUF); sc->vtballoon_page_frames = NULL; } VTBALLOON_LOCK_DESTROY(sc); return (0); } static int vtballoon_config_change(device_t dev) { struct vtballoon_softc *sc; sc = device_get_softc(dev); VTBALLOON_LOCK(sc); wakeup_one(sc); VTBALLOON_UNLOCK(sc); return (1); } static void vtballoon_negotiate_features(struct vtballoon_softc *sc) { device_t dev; uint64_t features; dev = sc->vtballoon_dev; features = virtio_negotiate_features(dev, VTBALLOON_FEATURES); sc->vtballoon_features = features; } static int vtballoon_alloc_virtqueues(struct vtballoon_softc *sc) { device_t dev; struct vq_alloc_info vq_info[2]; int nvqs; dev = sc->vtballoon_dev; nvqs = 2; VQ_ALLOC_INFO_INIT(&vq_info[0], 0, vtballoon_vq_intr, sc, &sc->vtballoon_inflate_vq, "%s inflate", device_get_nameunit(dev)); VQ_ALLOC_INFO_INIT(&vq_info[1], 0, vtballoon_vq_intr, sc, &sc->vtballoon_deflate_vq, "%s deflate", device_get_nameunit(dev)); return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info)); } static void vtballoon_vq_intr(void *xsc) { struct vtballoon_softc *sc; sc = xsc; VTBALLOON_LOCK(sc); wakeup_one(sc); VTBALLOON_UNLOCK(sc); } static void vtballoon_inflate(struct vtballoon_softc *sc, int npages) { struct virtqueue *vq; vm_page_t m; int i; vq = sc->vtballoon_inflate_vq; if (npages > VTBALLOON_PAGES_PER_REQUEST) npages = VTBALLOON_PAGES_PER_REQUEST; for (i = 0; i < npages; i++) { if ((m = vtballoon_alloc_page(sc)) == NULL) { sc->vtballoon_timeout = VTBALLOON_LOWMEM_TIMEOUT; break; } sc->vtballoon_page_frames[i] = VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; + KASSERT(m->queue == PQ_NONE, + ("%s: allocated page %p on queue", __func__, m)); TAILQ_INSERT_TAIL(&sc->vtballoon_pages, m, plinks.q); } if (i > 0) vtballoon_send_page_frames(sc, vq, i); } static void vtballoon_deflate(struct vtballoon_softc *sc, int npages) { TAILQ_HEAD(, vm_page) free_pages; struct virtqueue *vq; vm_page_t m; int i; vq = sc->vtballoon_deflate_vq; TAILQ_INIT(&free_pages); if (npages > VTBALLOON_PAGES_PER_REQUEST) npages = VTBALLOON_PAGES_PER_REQUEST; for (i = 0; i < npages; i++) { m = TAILQ_FIRST(&sc->vtballoon_pages); KASSERT(m != NULL, ("%s: no more pages to deflate", __func__)); sc->vtballoon_page_frames[i] = VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; TAILQ_REMOVE(&sc->vtballoon_pages, m, plinks.q); TAILQ_INSERT_TAIL(&free_pages, m, plinks.q); } if (i > 0) { /* Always tell host first before freeing the pages. */ vtballoon_send_page_frames(sc, vq, i); while ((m = TAILQ_FIRST(&free_pages)) != NULL) { TAILQ_REMOVE(&free_pages, m, plinks.q); vtballoon_free_page(sc, m); } } KASSERT((TAILQ_EMPTY(&sc->vtballoon_pages) && sc->vtballoon_current_npages == 0) || (!TAILQ_EMPTY(&sc->vtballoon_pages) && sc->vtballoon_current_npages != 0), ("%s: bogus page count %d", __func__, sc->vtballoon_current_npages)); } static void vtballoon_send_page_frames(struct vtballoon_softc *sc, struct virtqueue *vq, int npages) { struct sglist sg; struct sglist_seg segs[1]; void *c; int error; sglist_init(&sg, 1, segs); error = sglist_append(&sg, sc->vtballoon_page_frames, npages * sizeof(uint32_t)); KASSERT(error == 0, ("error adding page frames to sglist")); error = virtqueue_enqueue(vq, vq, &sg, 1, 0); KASSERT(error == 0, ("error enqueuing page frames to virtqueue")); virtqueue_notify(vq); /* * Inflate and deflate operations are done synchronously. The * interrupt handler will wake us up. */ VTBALLOON_LOCK(sc); while ((c = virtqueue_dequeue(vq, NULL)) == NULL) msleep(sc, VTBALLOON_MTX(sc), 0, "vtbspf", 0); VTBALLOON_UNLOCK(sc); KASSERT(c == vq, ("unexpected balloon operation response")); } static void vtballoon_pop(struct vtballoon_softc *sc) { while (!TAILQ_EMPTY(&sc->vtballoon_pages)) vtballoon_deflate(sc, sc->vtballoon_current_npages); } static void vtballoon_stop(struct vtballoon_softc *sc) { virtqueue_disable_intr(sc->vtballoon_inflate_vq); virtqueue_disable_intr(sc->vtballoon_deflate_vq); virtio_stop(sc->vtballoon_dev); } static vm_page_t vtballoon_alloc_page(struct vtballoon_softc *sc) { vm_page_t m; m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m != NULL) sc->vtballoon_current_npages++; return (m); } static void vtballoon_free_page(struct vtballoon_softc *sc, vm_page_t m) { vm_page_free(m); sc->vtballoon_current_npages--; } static uint32_t vtballoon_desired_size(struct vtballoon_softc *sc) { uint32_t desired; desired = virtio_read_dev_config_4(sc->vtballoon_dev, offsetof(struct virtio_balloon_config, num_pages)); return (le32toh(desired)); } static void vtballoon_update_size(struct vtballoon_softc *sc) { virtio_write_dev_config_4(sc->vtballoon_dev, offsetof(struct virtio_balloon_config, actual), htole32(sc->vtballoon_current_npages)); } static int vtballoon_sleep(struct vtballoon_softc *sc) { int rc, timeout; uint32_t current, desired; rc = 0; current = sc->vtballoon_current_npages; VTBALLOON_LOCK(sc); for (;;) { if (sc->vtballoon_flags & VTBALLOON_FLAG_DETACH) { rc = 1; break; } desired = vtballoon_desired_size(sc); sc->vtballoon_desired_npages = desired; /* * If given, use non-zero timeout on the first time through * the loop. On subsequent times, timeout will be zero so * we will reevaluate the desired size of the balloon and * break out to retry if needed. */ timeout = sc->vtballoon_timeout; sc->vtballoon_timeout = 0; if (current > desired) break; if (current < desired && timeout == 0) break; msleep(sc, VTBALLOON_MTX(sc), 0, "vtbslp", timeout); } VTBALLOON_UNLOCK(sc); return (rc); } static void vtballoon_thread(void *xsc) { struct vtballoon_softc *sc; uint32_t current, desired; sc = xsc; for (;;) { if (vtballoon_sleep(sc) != 0) break; current = sc->vtballoon_current_npages; desired = sc->vtballoon_desired_npages; if (desired != current) { if (desired > current) vtballoon_inflate(sc, desired - current); else vtballoon_deflate(sc, current - desired); vtballoon_update_size(sc); } } kthread_exit(); } static void vtballoon_add_sysctl(struct vtballoon_softc *sc) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = sc->vtballoon_dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "desired", CTLFLAG_RD, &sc->vtballoon_desired_npages, sizeof(uint32_t), "Desired balloon size in pages"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "current", CTLFLAG_RD, &sc->vtballoon_current_npages, sizeof(uint32_t), "Current balloon size in pages"); } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 352406) +++ head/sys/i386/i386/pmap.c (revision 352407) @@ -1,6165 +1,6165 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * Copyright (c) 2018 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * PTmap is recursive pagemap at top of virtual address space. * Within PTmap, the page directory can be found (third indirection). */ #define PTmap ((pt_entry_t *)(PTDPTDI << PDRSHIFT)) #define PTD ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE))) #define PTDpde ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE) + \ (PTDPTDI * PDESIZE))) /* * Translate a virtual address to the kernel virtual address of its page table * entry (PTE). This can be used recursively. If the address of a PTE as * previously returned by this macro is itself given as the argument, then the * address of the page directory entry (PDE) that maps the PTE will be * returned. * * This macro may be used before pmap_bootstrap() is called. */ #define vtopte(va) (PTmap + i386_btop(va)) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) _Static_assert(sizeof(struct pmap) <= sizeof(struct pmap_KBI), "pmap_KBI"); static int pgeflag = 0; /* PG_G or-in */ static int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; #ifdef PMAP_PAE_COMP pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif _Static_assert(VM_MAXUSER_ADDRESS == VADDR(TRPTDI, 0), "VM_MAXUSER_ADDRESS"); _Static_assert(VM_MAX_KERNEL_ADDRESS <= VADDR(PTDPTDI, 0), "VM_MAX_KERNEL_ADDRESS"); _Static_assert(PMAP_MAP_LOW == VADDR(LOWPTDI, 0), "PMAP_MAP_LOW"); _Static_assert(KERNLOAD == (KERNPTDI << PDRSHIFT), "KERNLOAD"); extern int pat_works; extern int pg_ps_enabled; extern int elf32_nxstack; #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); extern int pv_entry_max, pv_entry_count; static int pv_entry_high_water = 0; static struct md_page *pv_table; extern int shpgperproc; static struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ static int pv_maxchunks; /* How many chunks we have KVA for */ static vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ static pt_entry_t *CMAP3; static pd_entry_t *KPTD; static caddr_t CADDR3; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3; static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3; #ifdef SMP static int PMAP1cpu, PMAP3cpu; extern int PMAP1changedcpu; #endif extern int PMAP1changed; extern int PMAP1unchanged; static struct mtx PMAP2mutex; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static bool pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #ifdef PMAP_PAE_COMP static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait); #endif static void pmap_init_trm(void); static void pmap_invalidate_all_int(pmap_t pmap); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); extern char _end[]; extern u_long physfree; /* phys addr of next free page */ extern u_long vm86phystk;/* PA of vm86/bios stack */ extern u_long vm86paddr;/* address of vm86 region */ extern int vm86pa; /* phys addr of vm86 region */ extern u_long KERNend; /* phys addr end of kernel (just after bss) */ #ifdef PMAP_PAE_COMP pd_entry_t *IdlePTD_pae; /* phys addr of kernel PTD */ pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */ pt_entry_t *KPTmap_pae; /* address of kernel page tables */ #define IdlePTD IdlePTD_pae #define KPTmap KPTmap_pae #else pd_entry_t *IdlePTD_nopae; pt_entry_t *KPTmap_nopae; #define IdlePTD IdlePTD_nopae #define KPTmap KPTmap_nopae #endif extern u_long KPTphys; /* phys addr of kernel page tables */ extern u_long tramp_idleptd; static u_long allocpages(u_int cnt, u_long *physfree) { u_long res; res = *physfree; *physfree += PAGE_SIZE * cnt; bzero((void *)res, PAGE_SIZE * cnt); return (res); } static void pmap_cold_map(u_long pa, u_long va, u_long cnt) { pt_entry_t *pt; for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0; cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE) *pt = pa | PG_V | PG_RW | PG_A | PG_M; } static void pmap_cold_mapident(u_long pa, u_long cnt) { pmap_cold_map(pa, pa, cnt); } _Static_assert(LOWPTDI * 2 * NBPDR == KERNBASE, "Broken double-map of zero PTD"); static void __CONCAT(PMTYPE, remap_lower)(bool enable) { int i; for (i = 0; i < LOWPTDI; i++) IdlePTD[i] = enable ? IdlePTD[LOWPTDI + i] : 0; load_cr3(rcr3()); /* invalidate TLB */ } /* * Called from locore.s before paging is enabled. Sets up the first * kernel page table. Since kernel is mapped with PA == VA, this code * does not require relocations. */ void __CONCAT(PMTYPE, cold)(void) { pt_entry_t *pt; u_long a; u_int cr3, ncr4; physfree = (u_long)&_end; if (bootinfo.bi_esymtab != 0) physfree = bootinfo.bi_esymtab; if (bootinfo.bi_kernend != 0) physfree = bootinfo.bi_kernend; physfree = roundup2(physfree, NBPDR); KERNend = physfree; /* Allocate Kernel Page Tables */ KPTphys = allocpages(NKPT, &physfree); KPTmap = (pt_entry_t *)KPTphys; /* Allocate Page Table Directory */ #ifdef PMAP_PAE_COMP /* XXX only need 32 bytes (easier for now) */ IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree); #endif IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree); /* * Allocate KSTACK. Leave a guard page between IdlePTD and * proc0kstack, to control stack overflow for thread0 and * prevent corruption of the page table. We leak the guard * physical memory due to 1:1 mappings. */ allocpages(1, &physfree); proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree); /* vm86/bios stack */ vm86phystk = allocpages(1, &physfree); /* pgtable + ext + IOPAGES */ vm86paddr = vm86pa = allocpages(3, &physfree); /* Install page tables into PTD. Page table page 1 is wasted. */ for (a = 0; a < NKPT; a++) IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M; #ifdef PMAP_PAE_COMP /* PAE install PTD pointers into PDPT */ for (a = 0; a < NPGPTD; a++) IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V; #endif /* * Install recursive mapping for kernel page tables into * itself. */ for (a = 0; a < NPGPTD; a++) IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V | PG_RW; /* * Initialize page table pages mapping physical address zero * through the (physical) end of the kernel. Many of these * pages must be reserved, and we reserve them all and map * them linearly for convenience. We do this even if we've * enabled PSE above; we'll just switch the corresponding * kernel PDEs before we turn on paging. * * This and all other page table entries allow read and write * access for various reasons. Kernel mappings never have any * access restrictions. */ pmap_cold_mapident(0, atop(NBPDR) * LOWPTDI); pmap_cold_map(0, NBPDR * LOWPTDI, atop(NBPDR) * LOWPTDI); pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE)); /* Map page table directory */ #ifdef PMAP_PAE_COMP pmap_cold_mapident((u_long)IdlePDPT, 1); #endif pmap_cold_mapident((u_long)IdlePTD, NPGPTD); /* Map early KPTmap. It is really pmap_cold_mapident. */ pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT); /* Map proc0kstack */ pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES); /* ISA hole already mapped */ pmap_cold_mapident(vm86phystk, 1); pmap_cold_mapident(vm86pa, 3); /* Map page 0 into the vm86 page table */ *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V; /* ...likewise for the ISA hole for vm86 */ for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0; a < atop(ISA_HOLE_LENGTH); a++, pt++) *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A | PG_M | PG_V; /* Enable PSE, PGE, VME, and PAE if configured. */ ncr4 = 0; if ((cpu_feature & CPUID_PSE) != 0) { ncr4 |= CR4_PSE; pseflag = PG_PS; /* * Superpage mapping of the kernel text. Existing 4k * page table pages are wasted. */ for (a = KERNBASE; a < KERNend; a += NBPDR) IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M | PG_RW | PG_V; } if ((cpu_feature & CPUID_PGE) != 0) { ncr4 |= CR4_PGE; pgeflag = PG_G; } ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0; #ifdef PMAP_PAE_COMP ncr4 |= CR4_PAE; #endif if (ncr4 != 0) load_cr4(rcr4() | ncr4); /* Now enable paging */ #ifdef PMAP_PAE_COMP cr3 = (u_int)IdlePDPT; if ((cpu_feature & CPUID_PAT) == 0) wbinvd(); #else cr3 = (u_int)IdlePTD; #endif tramp_idleptd = cr3; load_cr3(cr3); load_cr0(rcr0() | CR0_PG); /* * Now running relocated at KERNBASE where the system is * linked to run. */ /* * Remove the lowest part of the double mapping of low memory * to get some null pointer checks. */ __CONCAT(PMTYPE, remap_lower)(false); kernel_vm_end = /* 0 + */ NKPT * NBPDR; #ifdef PMAP_PAE_COMP i386_pmap_VM_NFREEORDER = VM_NFREEORDER_PAE; i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_PAE; i386_pmap_PDRSHIFT = PDRSHIFT_PAE; #else i386_pmap_VM_NFREEORDER = VM_NFREEORDER_NOPAE; i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_NOPAE; i386_pmap_PDRSHIFT = PDRSHIFT_NOPAE; #endif } static void __CONCAT(PMTYPE, set_nx)(void) { #ifdef PMAP_PAE_COMP if ((amd_feature & AMDID_NX) == 0) return; pg_nx = PG_NX; elf32_nxstack = 1; /* EFER.EFER_NXE is set in initializecpu(). */ #endif } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after pmap_cold() created initial * kernel page table and enabled paging, and just syncs the pmap * module with what has already been done. */ static void __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; u_long res; int i; res = atop(firstaddr - (vm_paddr_t)KERNLOAD); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. * However, using "firstaddr" may waste a few pages of the * kernel virtual address space, because pmap_cold() may not * have mapped every physical page that it allocated. * Preferably, pmap_cold() would provide a first unused * virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t)firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = IdlePTD; #ifdef PMAP_PAE_COMP kernel_pmap->pm_pdpt = IdlePDPT; #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ kernel_pmap->pm_stats.resident_count = res; TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by pmap_cold(). However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; #ifdef PMAP_PAE_COMP if (!pae_mode) return; #else if (pae_mode) return; #endif CPU_FOREACH(i) { pc = pcpu_find(i); mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF | MTX_NEW); pc->pc_copyout_maddr = kva_alloc(ptoa(2)); if (pc->pc_copyout_maddr == 0) panic("unable to allocate non-sleepable copyout KVA"); sx_init(&pc->pc_copyout_slock, "cpslk"); pc->pc_copyout_saddr = kva_alloc(ptoa(2)); if (pc->pc_copyout_saddr == 0) panic("unable to allocate sleepable copyout KVA"); pc->pc_pmap_eh_va = kva_alloc(ptoa(1)); if (pc->pc_pmap_eh_va == 0) panic("unable to allocate pmap_extract_and_hold KVA"); pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va); /* * Skip if the mappings have already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("unable to allocate CMAP KVA"); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + ptoa(2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ static void __CONCAT(PMTYPE, init_pat)(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } #ifdef PMAP_PAE_COMP static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ static void __CONCAT(PMTYPE, init)(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + ptoa(i); mpte->wire_count = 1; /* * Collect the page table pages that were replaced by a 2/4MB * page. They are filled with equivalent 4KB page mappings. */ if (pseflag != 0 && KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, true)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(NKPT); /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #ifdef PMAP_PAE_COMP pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; pmap_init_trm(); if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } extern u_long pmap_pde_demotions; extern u_long pmap_pde_mappings; extern u_long pmap_pde_p_failures; extern u_long pmap_pde_promotions; /*************************************************** * Low level helper routines..... ***************************************************/ static boolean_t __CONCAT(PMTYPE, is_valid_memattr)(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int __CONCAT(PMTYPE, cache_bits)(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } static bool __CONCAT(PMTYPE, ps_enabled)(pmap_t pmap __unused) { return (pg_ps_enabled); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pde = pmap_pde(kernel_pmap, va); pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else /* if ((newpde & PG_G) == 0) */ /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ static void pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all_int(pmap); return; } sched_pin(); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva, pmap); sched_unpin(); } static void pmap_invalidate_all_int(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } static void __CONCAT(PMTYPE, invalidate_cache)(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; if (act->store == PCPU_GET(cpuid)) { pde = pmap_pde(kernel_pmap, act->va); pde_store(pde, act->newpde); } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ static void pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap) invlpg(va); } static void pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } static void pmap_invalidate_all_int(pmap_t pmap) { if (pmap == kernel_pmap) invltlb(); } static void __CONCAT(PMTYPE, invalidate_cache)(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ static void __CONCAT(PMTYPE, invalidate_page)(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_int(pmap, va); } static void __CONCAT(PMTYPE, invalidate_range)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_int(pmap, sva, eva); } static void __CONCAT(PMTYPE, invalidate_all)(pmap_t pmap) { pmap_invalidate_all_int(pmap); } static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was * created by a promotion that did not invalidate the 512 or 1024 4KB * page mappings that might exist in the TLB. Consequently, at this * point, the TLB may hold both 4KB and 2- or 4MB page mappings for * the address range [va, va + NBPDR). Therefore, the entire range * must be invalidated here. In contrast, when PG_PROMOTED is clear, * the TLB will not hold any 4KB page mappings for the address range * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the * 2- or 4MB page mapping from the TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range_int(pmap, va, va + NBPDR - 1); else pmap_invalidate_page_int(pmap, va); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ static pt_entry_t * __CONCAT(PMTYPE, pte)(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page_int(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } static pt_entry_t * pmap_pte_quick3(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP3 & PG_FRAME) != newpf) { *PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR3); PMAP1changed++; } else #ifdef SMP if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); invlcaddr(PADDR3); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR3 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } static pt_entry_t pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { pt_entry_t *eh_ptep, pte, *ptep; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde &= PG_FRAME; critical_enter(); eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep); if ((*eh_ptep & PG_FRAME) != pde) { *eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M; invlcaddr((void *)PCPU_GET(pmap_eh_va)); } ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) & (NPTEPG - 1)); pte = *ptep; critical_exit(); return (pte); } /* * Extract from the kernel page table the physical address that is mapped by * the given virtual address "va". * * This function may be used before pmap_bootstrap() is called. */ static vm_paddr_t __CONCAT(PMTYPE, kextract)(vm_offset_t va) { vm_paddr_t pa; if ((pa = pte_load(&PTD[va >> PDRSHIFT])) & PG_PS) { pa = (pa & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the PDE at * this point! For example, vtopte() must not be used to * access the PTE because it would use the new PDE. It is, * however, safe to use the old PDE because the page table * page is preserved by the promotion. */ pa = KPTmap[i386_btop(va)]; pa = (pa & PG_FRAME) | (va & PAGE_MASK); } return (pa); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ static vm_paddr_t __CONCAT(PMTYPE, extract)(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte_ufast(pmap, va, pde); rtval = (pte & PG_FRAME) | (va & PAGE_MASK); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ static vm_page_t __CONCAT(PMTYPE, extract_and_hold)(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); } else { pte = pmap_pte_ufast(pmap, va, pde); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) m = PHYS_TO_VM_PAGE(pte & PG_FRAME); } if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ static void __CONCAT(PMTYPE, kenter)(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ static void __CONCAT(PMTYPE, kremove)(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ static vm_offset_t __CONCAT(PMTYPE, map)(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag != 0) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range_int(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ static void __CONCAT(PMTYPE, qenter)(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; #ifdef PMAP_PAE_COMP pte_store(pte, pa | pg_nx | PG_RW | PG_V); #else pte_store(pte, pa | PG_RW | PG_V); #endif } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range_int(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ static void __CONCAT(PMTYPE, qremove)(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range_int(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * There is not need to invalidate the recursive mapping since * we never instantiate such mapping for the usermode pmaps, * and never remove page table pages from the kernel pmap. * Put page on a list so that it is released since all TLB * shootdown is done. */ MPASS(pmap != kernel_pmap); pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (pmap == kernel_pmap) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ static void __CONCAT(PMTYPE, pinit0)(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pdir = IdlePTD; #ifdef PMAP_PAE_COMP pmap->pm_pdpt = IdlePDPT; #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap_activate_boot(pmap); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ static int __CONCAT(PMTYPE, pinit)(pmap_t pmap) { vm_page_t m; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #ifdef PMAP_PAE_COMP pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD; i++) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pmap->pm_ptdpg[i] = m; #ifdef PMAP_PAE_COMP pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V; #endif } pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); #ifdef PMAP_PAE_COMP if ((cpu_feature & CPUID_PAT) == 0) { pmap_invalidate_cache_range( trunc_page((vm_offset_t)pmap->pm_pdpt), round_page((vm_offset_t)pmap->pm_pdpt + NPGPTD * sizeof(pdpt_entry_t))); } #endif for (i = 0; i < NPGPTD; i++) if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); /* Install the trampoline mapping. */ pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ static void __CONCAT(PMTYPE, release)(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = pmap->pm_ptdpg[i]; #ifdef PMAP_PAE_COMP KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif vm_page_unwire_noq(m); vm_page_free(m); } } /* * grow the number of kernel page table entries, if needed */ static void __CONCAT(PMTYPE, growkernel)(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; #ifdef PV_STATS extern int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; extern long pv_entry_frees, pv_entry_allocs; extern int pv_entry_spare; #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all_int(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = __CONCAT(PMTYPE, pte)(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page_int(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all_int(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire_noq(m); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entries tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags) { struct md_page *pvh; pv_entry_t pv; bool noreclaim; rw_assert(&pvh_global_lock, RA_WLOCKED); noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; if ((noreclaim && pv_entry_count >= pv_entry_high_water) || (pv = get_pv_entry(pmap, noreclaim)) == NULL) return (false); pv->pv_va = va; pvh = pa_to_pvh(pde & PG_PS_FRAME); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (pmap != kernel_pmap) { mpte->wire_count = NPTEPG; pmap->pm_stats.resident_count++; } } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (pmap == kernel_pmap) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page_int(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page_int(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page_int(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, struct spglist *free) { pt_entry_t *pte; bool anyvalid; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); anyvalid = false; for (pte = pmap_pte_quick(pmap, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated by * pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = true; if (pmap_remove_pte(pmap, pte, sva, free)) break; } return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ static void __CONCAT(PMTYPE, remove)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; if (pmap_remove_ptes(pmap, sva, pdnxt, &free)) anyvalid = 1; } out: sched_unpin(); if (anyvalid) pmap_invalidate_all_int(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void __CONCAT(PMTYPE, remove_all)(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page_int(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpde |= pg_nx; #endif if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ static void __CONCAT(PMTYPE, protect)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #ifdef PMAP_PAE_COMP if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all_int( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) pbits |= pg_nx; #endif if (pbits != obits) { #ifdef PMAP_PAE_COMP if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page_int(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all_int(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ static int __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; va = trunc_page(va); KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); KASSERT(va < PMAP_TRM_MIN_ADDRESS, ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", va)); KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpte |= pg_nx; #endif if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (pmap != kernel_pmap) newpte |= PG_U; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m); goto out; } pde = pmap_pde(pmap, va); if (pmap != kernel_pmap) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); rv = KERN_RESOURCE_SHORTAGE; goto out; } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ mpte = NULL; KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } origpte = *pte; pv = NULL; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#x", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#x", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) pmap_invalidate_page_int(pmap, va); origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap->pm_stats.resident_count++; } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#x", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } #ifdef PMAP_PAE_COMP else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } #endif if ((origpte & PG_A) != 0) pmap_invalidate_page_int(pmap, va); } else pte_store_zero(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); #endif rv = KERN_SUCCESS; out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2 or 4 MB page mapping. Returns * true if successful. Returns false if (1) a mapping already exists at the * specified virtual address or (2) a PV entry cannot be allocated without * reclaiming another PV entry. */ static bool pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t newpde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpde |= pg_nx; #endif if (pmap != kernel_pmap) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL) == KERN_SUCCESS); } /* * Tries to create the specified 2 or 4 MB page mapping. Returns KERN_SUCCESS * if the mapping was created, and either KERN_FAILURE or * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the * specified virtual address. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m) { struct spglist free; pd_entry_t oldpde, *pde; vm_page_t mt; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_V) != 0) { if ((flags & PMAP_ENTER_NOREPLACE) != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * If the PDE resulted from a promotion, then a * reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { if (pmap_remove_ptes(pmap, va, va + NBPDR, &free)) pmap_invalidate_all_int(pmap); } vm_page_free_pages_toq(&free, true); if (pmap == kernel_pmap) { /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ static void __CONCAT(PMTYPE, enter_object)(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_4mpage(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ static void __CONCAT(PMTYPE, enter_quick)(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t newpte, *pte; struct spglist free; KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (pmap != kernel_pmap) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } sched_pin(); pte = pmap_pte_quick(pmap, va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } sched_unpin(); return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page_int(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } sched_unpin(); return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpte |= pg_nx; #endif if (pmap != kernel_pmap) newpte |= PG_U; pte_store_zero(pte, newpte); sched_unpin(); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ static void * __CONCAT(PMTYPE, kenter_temporary)(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ static void __CONCAT(PMTYPE, object_init_pt)(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pg_ps_enabled && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ static void __CONCAT(PMTYPE, unwire)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. Since * current pmap is always the kernel pmap when executing in * kernel, and we do not copy from the kernel pmap to a user * pmap, this optimization is not usable in 4/4G full split i386 * world. */ static void __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; pt_entry_t *src_pte, *dst_pte, ptetemp; pd_entry_t srcptepaddr; vm_page_t dstmpte, srcmpte; vm_offset_t addr, end_addr, pdnxt; u_int ptepindex; if (dst_addr != src_addr) return; end_addr = src_addr + len; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { KASSERT(addr < PMAP_TRM_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy the trampoline")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = pmap_pte_quick3(src_pmap, addr); while (addr < pdnxt) { ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page_int( dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. */ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ static void __CONCAT(PMTYPE, zero_page)(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ static void __CONCAT(PMTYPE, zero_page_area)(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Copy 1 specified hardware page to another. */ static void __CONCAT(PMTYPE, copy_page)(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } static void __CONCAT(PMTYPE, copy_pages)(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(kernel_pmap, a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ static boolean_t __CONCAT(PMTYPE, page_exists_quick)(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ static int __CONCAT(PMTYPE, page_wired_mappings)(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ static boolean_t __CONCAT(PMTYPE, page_is_mapped)(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ static void __CONCAT(PMTYPE, remove_pages)(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = pmap_pte_quick(pmap, pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all_int(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ static boolean_t __CONCAT(PMTYPE, is_modified)(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ static boolean_t __CONCAT(PMTYPE, is_prefaultable)(pmap_t pmap, vm_offset_t addr) { pd_entry_t pde; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = *pmap_pde(pmap, addr); if (pde != 0 && (pde & PG_PS) == 0) rv = pmap_pte_ufast(pmap, addr, pde) == 0; PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ static boolean_t __CONCAT(PMTYPE, is_referenced)(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ static void __CONCAT(PMTYPE, remove_write)(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page_int(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ static int __CONCAT(PMTYPE, ts_referenced)(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page_int(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page_int(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ static void __CONCAT(PMTYPE, advise)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; bool anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = false; else { pv_lists_locked = true; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = false; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all_int(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(pdnxt, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldpde & PG_W) == 0) { va = eva; if (va > pdnxt) va = pdnxt; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); pte = pmap_pte_quick(pmap, va); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, NULL); anychanged = true; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = true; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range_int(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range_int(pmap, va, sva); } if (anychanged) pmap_invalidate_all_int(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ static void __CONCAT(PMTYPE, clear_modify)(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && pmap_demote_pde(pmap, pde, va) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_M | PG_RW); vm_page_dirty(m); pmap_invalidate_page_int(pmap, va); } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page_int(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * __CONCAT(PMTYPE, mapdev_attr)(vm_paddr_t pa, vm_size_t size, int mode, int flags) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_page_t m; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) { va = pa + PMAP_MAP_LOW; if ((flags & MAPDEV_SETATTR) == 0) return ((void *)(va + offset)); } else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && (ppim->mode == mode || (flags & MAPDEV_SETATTR) == 0)) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) { if ((flags & MAPDEV_SETATTR) == 0 && pmap_initialized) { m = PHYS_TO_VM_PAGE(pa); if (m != NULL && VM_PAGE_TO_PHYS(m) == pa) { pmap_kenter_attr(va + tmpsize, pa + tmpsize, m->md.pat_mode); continue; } } pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); } pmap_invalidate_range_int(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size); return ((void *)(va + offset)); } static void __CONCAT(PMTYPE, unmapdev)(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ static void __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void __CONCAT(PMTYPE, flush_page)(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ static int __CONCAT(PMTYPE, change_attr)(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range_int(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva); } return (0); } /* * perform the pmap work for mincore */ static int __CONCAT(PMTYPE, mincore)(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t pde; pt_entry_t pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, addr); if (pde != 0) { if ((pde & PG_PS) != 0) { pte = pde; /* Compute the physical address of the 4KB page. */ pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = pmap_pte_ufast(pmap, addr, pde); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static void __CONCAT(PMTYPE, activate)(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #ifdef PMAP_PAE_COMP cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; PCPU_SET(curpmap, pmap); critical_exit(); } static void __CONCAT(PMTYPE, activate_boot)(pmap_t pmap) { u_int cpuid; cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ static void __CONCAT(PMTYPE, align_superpage)(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } static vm_offset_t __CONCAT(PMTYPE, quick_enter_page)(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy %#jx", (uintmax_t)*pte)); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } static void __CONCAT(PMTYPE, quick_remove_page)(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } static vmem_t *pmap_trm_arena; static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS; static int trm_guard = PAGE_SIZE; static int pmap_trm_import(void *unused __unused, vmem_size_t size, int flags, vmem_addr_t *addrp) { vm_page_t m; vmem_addr_t af, addr, prev_addr; pt_entry_t *trm_pte; prev_addr = atomic_load_long(&pmap_trm_arena_last); size = round_page(size) + trm_guard; for (;;) { if (prev_addr + size < prev_addr || prev_addr + size < size || prev_addr + size > PMAP_TRM_MAX_ADDRESS) return (ENOMEM); addr = prev_addr + size; if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr)) break; } prev_addr += trm_guard; trm_pte = PTmap + atop(prev_addr); for (af = prev_addr; af < addr; af += PAGE_SIZE) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) | PG_M | PG_A | PG_RW | PG_V | pgeflag | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE)); } *addrp = prev_addr; return (0); } void pmap_init_trm(void) { vm_page_t pd_m; TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard); if ((trm_guard & PAGE_MASK) != 0) trm_guard = 0; pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK); vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE); pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO); if ((pd_m->flags & PG_ZERO) == 0) pmap_zero_page(pd_m); PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, TRUE); } static void * __CONCAT(PMTYPE, trm_alloc)(size_t size, int flags) { vmem_addr_t res; int error; MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0); error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res); if (error != 0) return (NULL); if ((flags & M_ZERO) != 0) bzero((void *)res, size); return ((void *)res); } static void __CONCAT(PMTYPE, trm_free)(void *addr, size_t size) { vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4)); } static void __CONCAT(PMTYPE, ksetrw)(vm_offset_t va) { *vtopte(va) |= PG_RW; } static void __CONCAT(PMTYPE, remap_lowptdi)(bool enable) { PTD[KPTDI] = enable ? PTD[LOWPTDI] : 0; invltlb_glob(); } static vm_offset_t __CONCAT(PMTYPE, get_map_low)(void) { return (PMAP_MAP_LOW); } static vm_offset_t __CONCAT(PMTYPE, get_vm_maxuser_address)(void) { return (VM_MAXUSER_ADDRESS); } static vm_paddr_t __CONCAT(PMTYPE, pg_frame)(vm_paddr_t pa) { return (pa & PG_FRAME); } static void __CONCAT(PMTYPE, sf_buf_map)(struct sf_buf *sf) { pt_entry_t opte, *ptep; /* * Update the sf_buf's virtual-to-physical mapping, flushing the * virtual address from the TLB. Since the reference count for * the sf_buf's old mapping was zero, that mapping is not * currently in use. Consequently, there is no need to exchange * the old and new PTEs atomically, even under PAE. */ ptep = vtopte(sf->kva); opte = *ptep; *ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, sf->m->md.pat_mode, 0); /* * Avoid unnecessary TLB invalidations: If the sf_buf's old * virtual-to-physical mapping was not used, then any processor * that has invalidated the sf_buf's virtual address from its TLB * since the last used mapping need not invalidate again. */ #ifdef SMP if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) CPU_ZERO(&sf->cpumask); #else if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) pmap_invalidate_page_int(kernel_pmap, sf->kva); #endif } static void __CONCAT(PMTYPE, cp_slow0_map)(vm_offset_t kaddr, int plen, vm_page_t *ma) { pt_entry_t *pte; int i; for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) { *pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(ma[i]) | pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(ma[i]), FALSE); invlpg(kaddr + ptoa(i)); } } static u_int __CONCAT(PMTYPE, get_kcr3)(void) { #ifdef PMAP_PAE_COMP return ((u_int)IdlePDPT); #else return ((u_int)IdlePTD); #endif } static u_int __CONCAT(PMTYPE, get_cr3)(pmap_t pmap) { #ifdef PMAP_PAE_COMP return ((u_int)vtophys(pmap->pm_pdpt)); #else return ((u_int)vtophys(pmap->pm_pdir)); #endif } static caddr_t __CONCAT(PMTYPE, cmap3)(vm_paddr_t pa, u_int pte_bits) { pt_entry_t *pte; pte = CMAP3; *pte = pa | pte_bits; invltlb(); return (CADDR3); } static void __CONCAT(PMTYPE, basemem_setup)(u_int basemem) { pt_entry_t *pte; int i; /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } struct bios16_pmap_handle { pt_entry_t *pte; pd_entry_t *ptd; pt_entry_t orig_ptd; }; static void * __CONCAT(PMTYPE, bios16_enter)(void) { struct bios16_pmap_handle *h; /* * no page table, so create one and install it. */ h = malloc(sizeof(struct bios16_pmap_handle), M_TEMP, M_WAITOK); h->pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); h->ptd = IdlePTD; *h->pte = vm86phystk | PG_RW | PG_V; h->orig_ptd = *h->ptd; *h->ptd = vtophys(h->pte) | PG_RW | PG_V; pmap_invalidate_all_int(kernel_pmap); /* XXX insurance for now */ return (h); } static void __CONCAT(PMTYPE, bios16_leave)(void *arg) { struct bios16_pmap_handle *h; h = arg; *h->ptd = h->orig_ptd; /* remove page table */ /* * XXX only needs to be invlpg(0) but that doesn't work on the 386 */ pmap_invalidate_all_int(kernel_pmap); free(h->pte, M_TEMP); /* ... and free it */ } #define PMM(a) \ .pm_##a = __CONCAT(PMTYPE, a), struct pmap_methods __CONCAT(PMTYPE, methods) = { PMM(ksetrw) PMM(remap_lower) PMM(remap_lowptdi) PMM(align_superpage) PMM(quick_enter_page) PMM(quick_remove_page) PMM(trm_alloc) PMM(trm_free) PMM(get_map_low) PMM(get_vm_maxuser_address) PMM(kextract) PMM(pg_frame) PMM(sf_buf_map) PMM(cp_slow0_map) PMM(get_kcr3) PMM(get_cr3) PMM(cmap3) PMM(basemem_setup) PMM(set_nx) PMM(bios16_enter) PMM(bios16_leave) PMM(bootstrap) PMM(is_valid_memattr) PMM(cache_bits) PMM(ps_enabled) PMM(pinit0) PMM(pinit) PMM(activate) PMM(activate_boot) PMM(advise) PMM(clear_modify) PMM(change_attr) PMM(mincore) PMM(copy) PMM(copy_page) PMM(copy_pages) PMM(zero_page) PMM(zero_page_area) PMM(enter) PMM(enter_object) PMM(enter_quick) PMM(kenter_temporary) PMM(object_init_pt) PMM(unwire) PMM(page_exists_quick) PMM(page_wired_mappings) PMM(page_is_mapped) PMM(remove_pages) PMM(is_modified) PMM(is_prefaultable) PMM(is_referenced) PMM(remove_write) PMM(ts_referenced) PMM(mapdev_attr) PMM(unmapdev) PMM(page_set_memattr) PMM(extract) PMM(extract_and_hold) PMM(map) PMM(qenter) PMM(qremove) PMM(release) PMM(remove) PMM(protect) PMM(remove_all) PMM(init) PMM(init_pat) PMM(growkernel) PMM(invalidate_page) PMM(invalidate_range) PMM(invalidate_all) PMM(invalidate_cache) PMM(flush_page) PMM(kenter) PMM(kremove) }; Index: head/sys/mips/mips/pmap.c =================================================================== --- head/sys/mips/mips/pmap.c (revision 352406) +++ head/sys/mips/mips/pmap.c (revision 352407) @@ -1,3698 +1,3698 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: src/sys/i386/i386/pmap.c,v 1.250.2.8 2000/11/21 00:09:14 ps * JNPR: pmap.c,v 1.11.2.1 2007/08/16 11:51:06 girish */ /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef PMAP_DEBUG #if !defined(DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_seg_index(v) (((v) >> SEGSHIFT) & (NPDEPG - 1)) #define pmap_pde_index(v) (((v) >> PDRSHIFT) & (NPDEPG - 1)) #define pmap_pte_index(v) (((v) >> PAGE_SHIFT) & (NPTEPG - 1)) #define pmap_pde_pindex(v) ((v) >> PDRSHIFT) #ifdef __mips_n64 #define NUPDE (NPDEPG * NPDEPG) #define NUSERPGTBLS (NUPDE + NPDEPG) #else #define NUPDE (NPDEPG) #define NUSERPGTBLS (NUPDE) #endif #define is_kernel_pmap(x) ((x) == kernel_pmap) struct pmap kernel_pmap_store; pd_entry_t *kernel_segmap; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static int nkpt; unsigned pmap_max_asid; /* max ASID supported by the system */ #define PMAP_ASID_RESERVED 0 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; static void pmap_asid_alloc(pmap_t pmap); static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static vm_page_t pmap_alloc_direct_page(unsigned int index, int req); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_grow_direct_page(int req); static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m); static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte); static void pmap_invalidate_all(pmap_t pmap); static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t); static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot); static void pmap_invalidate_page_action(void *arg); static void pmap_invalidate_range_action(void *arg); static void pmap_update_page_action(void *arg); #ifndef __mips_n64 /* * This structure is for high memory (memory above 512Meg in 32 bit) support. * The highmem area does not have a KSEG0 mapping, and we need a mechanism to * do temporary per-CPU mappings for pmap_zero_page, pmap_copy_page etc. * * At bootup, we reserve 2 virtual pages per CPU for mapping highmem pages. To * access a highmem physical address on a CPU, we map the physical address to * the reserved virtual address for the CPU in the kernel pagetable. This is * done with interrupts disabled(although a spinlock and sched_pin would be * sufficient). */ struct local_sysmaps { vm_offset_t base; uint32_t saved_intr; uint16_t valid1, valid2; }; static struct local_sysmaps sysmap_lmem[MAXCPU]; static __inline void pmap_alloc_lmem_map(void) { int i; for (i = 0; i < MAXCPU; i++) { sysmap_lmem[i].base = virtual_avail; virtual_avail += PAGE_SIZE * 2; sysmap_lmem[i].valid1 = sysmap_lmem[i].valid2 = 0; } } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { struct local_sysmaps *sysm; pt_entry_t *pte, npte; vm_offset_t va; uint32_t intr; int cpu; intr = intr_disable(); cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; sysm->saved_intr = intr; va = sysm->base; npte = TLBLO_PA_TO_PFN(phys) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, va); *pte = npte; sysm->valid1 = 1; return (va); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { struct local_sysmaps *sysm; pt_entry_t *pte, npte; vm_offset_t va1, va2; uint32_t intr; int cpu; intr = intr_disable(); cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; sysm->saved_intr = intr; va1 = sysm->base; va2 = sysm->base + PAGE_SIZE; npte = TLBLO_PA_TO_PFN(phys1) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, va1); *pte = npte; npte = TLBLO_PA_TO_PFN(phys2) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, va2); *pte = npte; sysm->valid1 = 1; sysm->valid2 = 1; return (va1); } static __inline void pmap_lmem_unmap(void) { struct local_sysmaps *sysm; pt_entry_t *pte; int cpu; cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; pte = pmap_pte(kernel_pmap, sysm->base); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, sysm->base); sysm->valid1 = 0; if (sysm->valid2) { pte = pmap_pte(kernel_pmap, sysm->base + PAGE_SIZE); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, sysm->base + PAGE_SIZE); sysm->valid2 = 0; } intr_restore(sysm->saved_intr); } #else /* __mips_n64 */ static __inline void pmap_alloc_lmem_map(void) { } static __inline vm_offset_t pmap_lmem_map1(vm_paddr_t phys) { return (0); } static __inline vm_offset_t pmap_lmem_map2(vm_paddr_t phys1, vm_paddr_t phys2) { return (0); } static __inline vm_offset_t pmap_lmem_unmap(void) { return (0); } #endif /* !__mips_n64 */ static __inline int pmap_pte_cache_bits(vm_paddr_t pa, vm_page_t m) { vm_memattr_t ma; ma = pmap_page_get_memattr(m); if (ma == VM_MEMATTR_WRITE_BACK && !is_cacheable_mem(pa)) ma = VM_MEMATTR_UNCACHEABLE; return PTE_C(ma); } #define PMAP_PTE_SET_CACHE_BITS(pte, ps, m) { \ pte &= ~PTE_C_MASK; \ pte |= pmap_pte_cache_bits(pa, m); \ } /* * Page table entry lookup routines. */ static __inline pd_entry_t * pmap_segmap(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_segtab[pmap_seg_index(va)]); } #ifdef __mips_n64 static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)*pdpe; return (&pde[pmap_pde_index(va)]); } static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pd_entry_t *pdpe; pdpe = pmap_segmap(pmap, va); if (*pdpe == NULL) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } #else static __inline pd_entry_t * pmap_pdpe_to_pde(pd_entry_t *pdpe, vm_offset_t va) { return (pdpe); } static __inline pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va) { return (pmap_segmap(pmap, va)); } #endif static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)*pde; return (&pte[pmap_pte_index(va)]); } pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || *pde == NULL) return (NULL); return (pmap_pde_to_pte(pde, va)); } vm_offset_t pmap_steal_memory(vm_size_t size) { vm_paddr_t bank_size, pa; vm_offset_t va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i + 2]; i += 2) { phys_avail[i] = phys_avail[i + 2]; phys_avail[i + 1] = phys_avail[i + 3]; } phys_avail[i] = 0; phys_avail[i + 1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; if (MIPS_DIRECT_MAPPABLE(pa) == 0) panic("Out of memory below 512Meg?"); va = MIPS_PHYS_TO_DIRECT(pa); bzero((caddr_t)va, size); return (va); } /* * Bootstrap the system enough to run with virtual memory. This * assumes that the phys_avail array has been initialized. */ static void pmap_create_kernel_pagetable(void) { int i, j; vm_offset_t ptaddr; pt_entry_t *pte; #ifdef __mips_n64 pd_entry_t *pde; vm_offset_t pdaddr; int npt, npde; #endif /* * Allocate segment table for the kernel */ kernel_segmap = (pd_entry_t *)pmap_steal_memory(PAGE_SIZE); /* * Allocate second level page tables for the kernel */ #ifdef __mips_n64 npde = howmany(NKPT, NPDEPG); pdaddr = pmap_steal_memory(PAGE_SIZE * npde); #endif nkpt = NKPT; ptaddr = pmap_steal_memory(PAGE_SIZE * nkpt); /* * The R[4-7]?00 stores only one copy of the Global bit in the * translation lookaside buffer for each 2 page entry. Thus invalid * entrys must have the Global bit set so when Entry LO and Entry HI * G bits are anded together they will produce a global bit to store * in the tlb. */ for (i = 0, pte = (pt_entry_t *)ptaddr; i < (nkpt * NPTEPG); i++, pte++) *pte = PTE_G; #ifdef __mips_n64 for (i = 0, npt = nkpt; npt > 0; i++) { kernel_segmap[i] = (pd_entry_t)(pdaddr + i * PAGE_SIZE); pde = (pd_entry_t *)kernel_segmap[i]; for (j = 0; j < NPDEPG && npt > 0; j++, npt--) pde[j] = (pd_entry_t)(ptaddr + (i * NPDEPG + j) * PAGE_SIZE); } #else for (i = 0, j = pmap_seg_index(VM_MIN_KERNEL_ADDRESS); i < nkpt; i++, j++) kernel_segmap[j] = (pd_entry_t)(ptaddr + (i * PAGE_SIZE)); #endif PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_segtab = kernel_segmap; CPU_FILL(&kernel_pmap->pm_active); TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_asid[0].asid = PMAP_ASID_RESERVED; kernel_pmap->pm_asid[0].gen = 0; kernel_vm_end += nkpt * NPTEPG * PAGE_SIZE; } void pmap_bootstrap(void) { int i; int need_local_mappings = 0; /* Sort. */ again: for (i = 0; phys_avail[i + 1] != 0; i += 2) { /* * Keep the memory aligned on page boundary. */ phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); if (i < 2) continue; if (phys_avail[i - 2] > phys_avail[i]) { vm_paddr_t ptemp[2]; ptemp[0] = phys_avail[i + 0]; ptemp[1] = phys_avail[i + 1]; phys_avail[i + 0] = phys_avail[i - 2]; phys_avail[i + 1] = phys_avail[i - 1]; phys_avail[i - 2] = ptemp[0]; phys_avail[i - 1] = ptemp[1]; goto again; } } /* * In 32 bit, we may have memory which cannot be mapped directly. * This memory will need temporary mapping before it can be * accessed. */ if (!MIPS_DIRECT_MAPPABLE(phys_avail[i - 1] - 1)) need_local_mappings = 1; /* * Copy the phys_avail[] array before we start stealing memory from it. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { physmem_desc[i] = phys_avail[i]; physmem_desc[i + 1] = phys_avail[i + 1]; } Maxmem = atop(phys_avail[i - 1]); if (bootverbose) { printf("Physical memory chunk(s):\n"); for (i = 0; phys_avail[i + 1] != 0; i += 2) { vm_paddr_t size; size = phys_avail[i + 1] - phys_avail[i]; printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n", (uintmax_t) phys_avail[i], (uintmax_t) phys_avail[i + 1] - 1, (uintmax_t) size, (uintmax_t) size / PAGE_SIZE); } printf("Maxmem is 0x%0jx\n", ptoa((uintmax_t)Maxmem)); } /* * Steal the message buffer from the beginning of memory. */ msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize); msgbufinit(msgbufp, msgbufsize); /* * Steal thread0 kstack. */ kstack0 = pmap_steal_memory(KSTACK_PAGES << PAGE_SHIFT); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; #ifdef SMP /* * Steal some virtual address space to map the pcpu area. */ virtual_avail = roundup2(virtual_avail, PAGE_SIZE * 2); pcpup = (struct pcpu *)virtual_avail; virtual_avail += PAGE_SIZE * 2; /* * Initialize the wired TLB entry mapping the pcpu region for * the BSP at 'pcpup'. Up until this point we were operating * with the 'pcpup' for the BSP pointing to a virtual address * in KSEG0 so there was no need for a TLB mapping. */ mips_pcpu_tlb_init(PCPU_ADDR(0)); if (bootverbose) printf("pcpu is available at virtual address %p.\n", pcpup); #endif if (need_local_mappings) pmap_alloc_lmem_map(); pmap_create_kernel_pagetable(); pmap_max_asid = VMNUM_PIDS; mips_wr_entryhi(0); mips_wr_pagemask(0); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_flags = VM_MEMATTR_DEFAULT << PV_MEMATTR_SHIFT; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { } /*************************************************** * Low level helper routines..... ***************************************************/ #ifdef SMP static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid, cpu, self; cpuset_t active_cpus; sched_pin(); if (is_kernel_pmap(pmap)) { smp_rendezvous(NULL, fn, NULL, arg); goto out; } /* Force ASID update on inactive CPUs */ CPU_FOREACH(cpu) { if (!CPU_ISSET(cpu, &pmap->pm_active)) pmap->pm_asid[cpu].gen = 0; } cpuid = PCPU_GET(cpuid); /* * XXX: barrier/locking for active? * * Take a snapshot of active here, any further changes are ignored. * tlb update/invalidate should be harmless on inactive CPUs */ active_cpus = pmap->pm_active; self = CPU_ISSET(cpuid, &active_cpus); CPU_CLR(cpuid, &active_cpus); /* Optimize for the case where this cpu is the only active one */ if (CPU_EMPTY(&active_cpus)) { if (self) fn(arg); } else { if (self) CPU_SET(cpuid, &active_cpus); smp_rendezvous_cpus(active_cpus, NULL, fn, NULL, arg); } out: sched_unpin(); } #else /* !SMP */ static __inline void pmap_call_on_active_cpus(pmap_t pmap, void (*fn)(void *), void *arg) { int cpuid; if (is_kernel_pmap(pmap)) { fn(arg); return; } cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &pmap->pm_active)) pmap->pm_asid[cpuid].gen = 0; else fn(arg); } #endif /* SMP */ static void pmap_invalidate_all(pmap_t pmap) { pmap_call_on_active_cpus(pmap, (void (*)(void *))tlb_invalidate_all_user, pmap); } struct pmap_invalidate_page_arg { pmap_t pmap; vm_offset_t va; }; static void pmap_invalidate_page_action(void *arg) { struct pmap_invalidate_page_arg *p = arg; tlb_invalidate_address(p->pmap, p->va); } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct pmap_invalidate_page_arg arg; arg.pmap = pmap; arg.va = va; pmap_call_on_active_cpus(pmap, pmap_invalidate_page_action, &arg); } struct pmap_invalidate_range_arg { pmap_t pmap; vm_offset_t sva; vm_offset_t eva; }; static void pmap_invalidate_range_action(void *arg) { struct pmap_invalidate_range_arg *p = arg; tlb_invalidate_range(p->pmap, p->sva, p->eva); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_invalidate_range_arg arg; arg.pmap = pmap; arg.sva = sva; arg.eva = eva; pmap_call_on_active_cpus(pmap, pmap_invalidate_range_action, &arg); } struct pmap_update_page_arg { pmap_t pmap; vm_offset_t va; pt_entry_t pte; }; static void pmap_update_page_action(void *arg) { struct pmap_update_page_arg *p = arg; tlb_update(p->pmap, p->va, p->pte); } static void pmap_update_page(pmap_t pmap, vm_offset_t va, pt_entry_t pte) { struct pmap_update_page_arg arg; arg.pmap = pmap; arg.va = va; arg.pte = pte; pmap_call_on_active_cpus(pmap, pmap_update_page_action, &arg); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; vm_offset_t retval = 0; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (pte) { retval = TLBLO_PTE_TO_PA(*pte) | (va & PAGE_MASK); } PMAP_UNLOCK(pmap); return (retval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t pte, *ptep; vm_paddr_t pa; vm_page_t m; m = NULL; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, va); if (ptep != NULL) { pte = *ptep; if (pte_test(&pte, PTE_V) && (!pte_test(&pte, PTE_RO) || (prot & VM_PROT_WRITE) == 0)) { pa = TLBLO_PTE_TO_PA(pte); m = PHYS_TO_VM_PAGE(pa); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * add a wired page to the kva */ void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { pt_entry_t *pte; pt_entry_t opte, npte; #ifdef PMAP_DEBUG printf("pmap_kenter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif pte = pmap_pte(kernel_pmap, va); opte = *pte; npte = TLBLO_PA_TO_PFN(pa) | PTE_C(ma) | PTE_D | PTE_V | PTE_G; *pte = npte; if (pte_test(&opte, PTE_V) && opte != npte) pmap_update_page(kernel_pmap, va, npte); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { KASSERT(is_cacheable_mem(pa), ("pmap_kenter: memory at 0x%lx is not cacheable", (u_long)pa)); pmap_kenter_attr(va, pa, VM_MEMATTR_DEFAULT); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); for (; size > 0; size -= PAGE_SIZE) { /* * XXXCEM: this is somewhat inefficient on SMP systems in that * every single page is individually TLB-invalidated via * rendezvous (pmap_update_page()), instead of invalidating the * entire range via a single rendezvous. */ pmap_kenter_attr(va, pa, VM_MEMATTR_UNCACHEABLE); va += PAGE_SIZE; pa += PAGE_SIZE; } } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); /* * XXXCEM: Similar to pmap_kenter_device, this is inefficient on SMP, * in that pages are invalidated individually instead of a single range * rendezvous. */ for (; size > 0; size -= PAGE_SIZE) { pmap_kremove(va); va += PAGE_SIZE; } } /* * remove a page from the kernel pagetables */ /* PMAP_INLINE */ void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; /* * Write back all caches from the page being destroyed */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; if (MIPS_DIRECT_MAPPABLE(end - 1)) return (MIPS_PHYS_TO_DIRECT(start)); va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; vm_offset_t origva = va; for (i = 0; i < count; i++) { pmap_flush_pvcache(m[i]); pmap_kenter(va, VM_PAGE_TO_PHYS(m[i])); va += PAGE_SIZE; } mips_dcache_wbinv_range_index(origva, PAGE_SIZE*count); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { pt_entry_t *pte; vm_offset_t origva; if (count < 1) return; mips_dcache_wbinv_range_index(va, PAGE_SIZE * count); origva = va; do { pte = pmap_pte(kernel_pmap, va); *pte = PTE_G; va += PAGE_SIZE; } while (--count > 0); pmap_invalidate_range(kernel_pmap, origva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static PMAP_INLINE boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m) { pd_entry_t *pde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ #ifdef __mips_n64 if (m->pindex < NUPDE) pde = pmap_pde(pmap, va); else pde = pmap_segmap(pmap, va); #else pde = pmap_pde(pmap, va); #endif *pde = 0; pmap->pm_stats.resident_count--; #ifdef __mips_n64 if (m->pindex < NUPDE) { pd_entry_t *pdp; vm_page_t pdpg; /* * Recursively decrement next level pagetable refcount */ pdp = (pd_entry_t *)*pmap_segmap(pmap, va); pdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pdp)); pmap_unwire_ptp(pmap, va, pdpg); } #endif /* * If the page is finally unwired, simply free it. */ vm_page_free_zero(m); vm_wire_sub(1); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(pde != 0, ("pmap_unuse_pt: pde != 0")); mpte = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(pde)); return (pmap_unwire_ptp(pmap, va, mpte)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_segtab = kernel_segmap; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } static void pmap_grow_direct_page(int req) { #ifdef __mips_n64 vm_wait(NULL); #else if (!vm_page_reclaim_contig(req, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) vm_wait(NULL); #endif } static vm_page_t pmap_alloc_direct_page(unsigned int index, int req) { vm_page_t m; m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, req | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) return (NULL); if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->pindex = index; return (m); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; int i, req_class; /* * allocate the page directory page */ req_class = VM_ALLOC_NORMAL; while ((ptdpg = pmap_alloc_direct_page(NUSERPGTBLS, req_class)) == NULL) pmap_grow_direct_page(req_class); ptdva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(ptdpg)); pmap->pm_segtab = (pd_entry_t *)ptdva; CPU_ZERO(&pmap->pm_active); for (i = 0; i < MAXCPU; i++) { pmap->pm_asid[i].asid = PMAP_ASID_RESERVED; pmap->pm_asid[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, u_int flags) { vm_offset_t pageva; vm_page_t m; int req_class; /* * Find or fabricate a new pagetable page */ req_class = VM_ALLOC_NORMAL; if ((m = pmap_alloc_direct_page(ptepindex, req_class)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); pmap_grow_direct_page(req_class); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page * table page may have been allocated. */ return (NULL); } /* * Map the pagetable page into the process address space, if it * isn't already there. */ pageva = MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #ifdef __mips_n64 if (ptepindex >= NUPDE) { pmap->pm_segtab[ptepindex - NUPDE] = (pd_entry_t)pageva; } else { pd_entry_t *pdep, *pde; int segindex = ptepindex >> (SEGSHIFT - PDRSHIFT); int pdeindex = ptepindex & (NPDEPG - 1); vm_page_t pg; pdep = &pmap->pm_segtab[segindex]; if (*pdep == NULL) { /* recurse for allocating page dir */ if (_pmap_allocpte(pmap, NUPDE + segindex, flags) == NULL) { /* alloc failed, release current */ vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { pg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pdep)); pg->wire_count++; } /* Next level entry */ pde = (pd_entry_t *)*pdep; pde[pdeindex] = (pd_entry_t)pageva; } #else pmap->pm_segtab[ptepindex] = (pd_entry_t)pageva; #endif pmap->pm_stats.resident_count++; return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { unsigned ptepindex; pd_entry_t *pde; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment the hold * count, and activate it. */ if (pde != NULL && *pde != NULL) { m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(*pde)); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_offset_t ptdva; vm_page_t ptdpg; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); ptdva = (vm_offset_t)pmap->pm_segtab; ptdpg = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS(ptdva)); vm_page_unwire_noq(ptdpg); vm_page_free_zero(ptdpg); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_page_t nkpg; pd_entry_t *pde, *pdpe; pt_entry_t *pte; int i, req_class; mtx_assert(&kernel_map->system_mtx, MA_OWNED); req_class = VM_ALLOC_INTERRUPT; addr = roundup2(addr, NBSEG); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_segmap(kernel_pmap, kernel_vm_end); #ifdef __mips_n64 if (*pdpe == 0) { /* new intermediate page table entry */ nkpg = pmap_alloc_direct_page(nkpt, req_class); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); *pdpe = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); continue; /* try again */ } #endif pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if (*pde != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } /* * This index is bogus, but out of the way */ nkpg = pmap_alloc_direct_page(nkpt, req_class); #ifndef __mips_n64 if (nkpg == NULL && vm_page_reclaim_contig(req_class, 1, 0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0)) nkpg = pmap_alloc_direct_page(nkpt, req_class); #endif if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; *pde = (pd_entry_t)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(nkpg)); /* * The R[4-7]?00 stores only one copy of the Global bit in * the translation lookaside buffer for each 2 page entry. * Thus invalid entrys must have the Global bit set so when * Entry LO and Entry HI G bits are anded together they will * produce a global bit to store in the tlb. */ pte = (pt_entry_t *)*pde; for (i = 0; i < NPTEPG; i++) pte[i] = PTE_G; kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); #ifdef __mips_n64 CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); #else CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); #endif static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #ifdef __mips_n64 #define PC_FREE0_1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful #else #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ #endif static const u_long pc_freemask[_NPCM] = { #ifdef __mips_n64 PC_FREE0_1, PC_FREE0_1, PC_FREE2 #else PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 #endif }; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, oldpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; u_long inuse; int bit, field, freed, idx; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; va = pv->pv_va; pde = pmap_pde(pmap, va); KASSERT(pde != NULL && *pde != 0, ("pmap_pv_reclaim: pde")); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if (pte_test(&oldpte, PTE_W)) continue; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(oldpte)); if (pte_test(&oldpte, PTE_D)) vm_page_dirty(m); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS( (vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / (sizeof(u_long) * NBBY); bit = idx % (sizeof(u_long) * NBBY); pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(MIPS_DIRECT_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; int bit, field, idx; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { idx = field * sizeof(pc->pc_map[field]) * NBBY + bit; pv = &pc->pc_pventry[idx]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, VM_ALLOC_NORMAL | VM_ALLOC_WIRED); if (m == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); dump_add_page(m->phys_addr); pc = (struct pv_chunk *)MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found, pa %lx va %lx", (u_long)VM_PAGE_TO_PHYS(__containerof(pvh, struct vm_page, md)), (u_long)va)); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_page_t mpte, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t pde) { pt_entry_t oldpte; vm_page_t m; vm_paddr_t pa; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Write back all cache lines from the page being unmapped. */ mips_dcache_wbinv_range_index(va, PAGE_SIZE); oldpte = *ptq; if (is_kernel_pmap(pmap)) *ptq = PTE_G; else *ptq = 0; if (pte_test(&oldpte, PTE_W)) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte_test(&oldpte, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(oldpte); m = PHYS_TO_VM_PAGE(pa); if (pte_test(&oldpte, PTE_D)) { KASSERT(!pte_test(&oldpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)va, (uintmax_t)oldpte)); vm_page_dirty(m); } if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); m->md.pv_flags &= ~PV_TABLE_REF; pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, pde)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(struct pmap *pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *ptq; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (pde == NULL || *pde == 0) return; ptq = pmap_pde_to_pte(pde, va); /* * If there is no pte for this address, just skip it! */ if (!pte_test(ptq, PTE_V)) return; (void)pmap_remove_pte(pmap, ptq, va, *pde); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va, va_next; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * special handling of removing one page. a very common operation * and easy to short circuit some code. */ if ((sva + PAGE_SIZE) == eva) { pmap_remove_page(pmap, sva); goto out; } for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde)) { sva += PAGE_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } out: rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte, tpte; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); rw_wlock(&pvh_global_lock); if (m->md.pv_flags & PV_TABLE_REF) vm_page_aflag_set(m, PGA_REFERENCED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); /* * If it's last mapping writeback all caches from * the page being destroyed */ if (TAILQ_NEXT(pv, pv_list) == NULL) mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_all: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = *pte; if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; if (pte_test(&tpte, PTE_W)) pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) { KASSERT(!pte_test(&tpte, PTE_RO), ("%s: modified page not writable: va: %p, pte: %#jx", __func__, (void *)pv->pv_va, (uintmax_t)tpte)); vm_page_dirty(m); } pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); pmap_unuse_pt(pmap, pv->pv_va, *pde); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pt_entry_t pbits, *pte; pd_entry_t *pde, *pdpe; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pbits = *pte; if (!pte_test(&pbits, PTE_V) || pte_test(&pbits, PTE_RO)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pte_set(&pbits, PTE_RO); if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); if (pte_test(&pbits, PTE_MANAGED)) { pa = TLBLO_PTE_TO_PA(pbits); m = PHYS_TO_VM_PAGE(pa); vm_page_dirty(m); } if (va == va_next) va = sva; } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } *pte = pbits; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { vm_paddr_t pa, opa; pt_entry_t *pte; pt_entry_t origpte, newpte; pv_entry_t pv; vm_page_t mpte, om; va &= ~PAGE_MASK; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); newpte = TLBLO_PA_TO_PFN(pa) | init_pte_prot(m, flags, prot); if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PTE_W; if (is_kernel_pmap(pmap)) newpte |= PTE_G; PMAP_PTE_SET_CACHE_BITS(newpte, pa, m); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PTE_MANAGED; mpte = NULL; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=%p", (void *)pmap->pm_segtab, (void *)va); } origpte = *pte; KASSERT(!pte_test(&origpte, PTE_D | PTE_RO | PTE_V), ("pmap_enter: modified page not writable: va: %p, pte: %#jx", (void *)va, (uintmax_t)origpte)); opa = TLBLO_PTE_TO_PA(origpte); /* * Mapping has not changed, must be protection or wiring change. */ if (pte_test(&origpte, PTE_V) && opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is * wired, the PT page will be also. */ if (pte_test(&newpte, PTE_W) && !pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count++; else if (!pte_test(&newpte, PTE_W) && pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (pte_test(&origpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (is_kernel_pmap(pmap)) *pte = PTE_G; else *pte = 0; if (pte_test(&origpte, PTE_W)) pmap->pm_stats.wired_count--; if (pte_test(&origpte, PTE_MANAGED)) { om = PHYS_TO_VM_PAGE(opa); if (pte_test(&origpte, PTE_D)) vm_page_dirty(om); if ((om->md.pv_flags & PV_TABLE_REF) != 0) { om->md.pv_flags &= ~PV_TABLE_REF; vm_page_aflag_set(om, PGA_REFERENCED); } pv = pmap_pvh_remove(&om->md, pmap, va); if (!pte_test(&newpte, PTE_MANAGED)) free_pv_entry(pmap, pv); - if (vm_page_aflags(m) & PGA_WRITEABLE) != 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); origpte = 0; if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: %p", (void *)va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if (pte_test(&newpte, PTE_MANAGED)) { m->md.pv_flags |= PV_TABLE_REF; if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pte_test(&newpte, PTE_RO)) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Increment counters */ if (pte_test(&newpte, PTE_W)) pmap->pm_stats.wired_count++; validate: #ifdef PMAP_DEBUG printf("pmap_enter: va: %p -> pa: %p\n", (void *)va, (void *)pa); #endif /* * if the mapping or permission bits are different, we need to * update the pte. */ if (origpte != newpte) { *pte = newpte; if (pte_test(&origpte, PTE_V)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if (pte_test(&origpte, PTE_D)) { if (pte_test(&origpte, PTE_MANAGED)) vm_page_dirty(m); } pmap_update_page(pmap, va, newpte); } } /* * Sync I & D caches for executable pages. Do this only if the * target pmap belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (!is_kernel_pmap(pmap) && (pmap == &curproc->p_vmspace->vm_pmap) && (prot & VM_PROT_EXECUTE)) { mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte, npte; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not resident, we are * creating it here. */ if (va < VM_MAXUSER_ADDRESS) { pd_entry_t *pde; unsigned ptepindex; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ pde = pmap_pde(pmap, va); /* * If the page table page is mapped, we just * increment the hold count, and activate it. */ if (pde && *pde != 0) { mpte = PHYS_TO_VM_PAGE( MIPS_DIRECT_TO_PHYS(*pde)); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } pte = pmap_pte(pmap, va); if (pte_test(pte, PTE_V)) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, mpte, va, m)) { if (mpte != NULL) { pmap_unwire_ptp(pmap, va, mpte); mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ npte = PTE_RO | TLBLO_PA_TO_PFN(pa) | PTE_V; if ((m->oflags & VPO_UNMANAGED) == 0) npte |= PTE_MANAGED; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); if (is_kernel_pmap(pmap)) *pte = npte | PTE_G; else { *pte = npte; /* * Sync I & D caches. Do this only if the target pmap * belongs to the current process. Otherwise, an * unresolvable TLB miss may occur. */ if (pmap == &curproc->p_vmspace->vm_pmap) { va &= ~PAGE_MASK; mips_icache_sync_range(va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } } return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; if (i != 0) printf("%s: ERROR!!! More than one page of virtual address mapping not supported\n", __func__); if (MIPS_DIRECT_MAPPABLE(pa)) { va = MIPS_PHYS_TO_DIRECT(pa); } else { #ifndef __mips_n64 /* XXX : to be converted to new style */ int cpu; register_t intr; struct local_sysmaps *sysm; pt_entry_t *pte, npte; /* If this is used other than for dumps, we may need to leave * interrupts disasbled on return. If crash dumps don't work when * we get to this point, we might want to consider this (leaving things * disabled as a starting point ;-) */ intr = intr_disable(); cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; /* Since this is for the debugger, no locks or any other fun */ npte = TLBLO_PA_TO_PFN(pa) | PTE_C_CACHE | PTE_D | PTE_V | PTE_G; pte = pmap_pte(kernel_pmap, sysm->base); *pte = npte; sysm->valid1 = 1; pmap_update_page(kernel_pmap, sysm->base, npte); va = sysm->base; intr_restore(intr); #endif } return ((void *)va); } void pmap_kenter_temporary_free(vm_paddr_t pa) { #ifndef __mips_n64 /* XXX : to be converted to new style */ int cpu; register_t intr; struct local_sysmaps *sysm; #endif if (MIPS_DIRECT_MAPPABLE(pa)) { /* nothing to do for this case */ return; } #ifndef __mips_n64 /* XXX : to be converted to new style */ cpu = PCPU_GET(cpuid); sysm = &sysmap_lmem[cpu]; if (sysm->valid1) { pt_entry_t *pte; intr = intr_disable(); pte = pmap_pte(kernel_pmap, sysm->base); *pte = PTE_G; pmap_invalidate_page(kernel_pmap, sysm->base); intr_restore(intr); sysm->valid1 = 0; } #endif } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va_next; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == NULL) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V)) continue; if (!pte_test(pte, PTE_W)) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); pte_clear(pte, PTE_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); } else { va = pmap_lmem_map1(phys); bzero((caddr_t)va, PAGE_SIZE); mips_dcache_wbinv_range(va, PAGE_SIZE); pmap_lmem_unmap(); } } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(phys)) { va = MIPS_PHYS_TO_DIRECT(phys); bzero((char *)(caddr_t)va + off, size); mips_dcache_wbinv_range(va + off, size); } else { va = pmap_lmem_map1(phys); bzero((char *)va + off, size); mips_dcache_wbinv_range(va + off, size); pmap_lmem_unmap(); } } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. * * Use XKPHYS for 64 bit, and KSEG0 where possible for 32 bit. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { vm_offset_t va_src, va_dst; vm_paddr_t phys_src = VM_PAGE_TO_PHYS(src); vm_paddr_t phys_dst = VM_PAGE_TO_PHYS(dst); if (MIPS_DIRECT_MAPPABLE(phys_src) && MIPS_DIRECT_MAPPABLE(phys_dst)) { /* easy case, all can be accessed via KSEG0 */ /* * Flush all caches for VA that are mapped to this page * to make sure that data in SDRAM is up to date */ pmap_flush_pvcache(src); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(phys_dst), PAGE_SIZE); va_src = MIPS_PHYS_TO_DIRECT(phys_src); va_dst = MIPS_PHYS_TO_DIRECT(phys_dst); bcopy((caddr_t)va_src, (caddr_t)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); } else { va_src = pmap_lmem_map2(phys_src, phys_dst); va_dst = va_src + PAGE_SIZE; bcopy((void *)va_src, (void *)va_dst, PAGE_SIZE); mips_dcache_wbinv_range(va_dst, PAGE_SIZE); pmap_lmem_unmap(); } } int unmapped_buf_allowed; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { char *a_cp, *b_cp; vm_page_t a_m, b_m; vm_offset_t a_pg_offset, b_pg_offset; vm_paddr_t a_phys, b_phys; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_m = ma[a_offset >> PAGE_SHIFT]; a_phys = VM_PAGE_TO_PHYS(a_m); b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_m = mb[b_offset >> PAGE_SHIFT]; b_phys = VM_PAGE_TO_PHYS(b_m); if (MIPS_DIRECT_MAPPABLE(a_phys) && MIPS_DIRECT_MAPPABLE(b_phys)) { pmap_flush_pvcache(a_m); mips_dcache_wbinv_range_index( MIPS_PHYS_TO_DIRECT(b_phys), PAGE_SIZE); a_cp = (char *)MIPS_PHYS_TO_DIRECT(a_phys) + a_pg_offset; b_cp = (char *)MIPS_PHYS_TO_DIRECT(b_phys) + b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); } else { a_cp = (char *)pmap_lmem_map2(a_phys, b_phys); b_cp = (char *)a_cp + PAGE_SIZE; a_cp += a_pg_offset; b_cp += b_pg_offset; bcopy(a_cp, b_cp, cnt); mips_dcache_wbinv_range((vm_offset_t)b_cp, cnt); pmap_lmem_unmap(); } a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { #if defined(__mips_n64) return MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); #else vm_paddr_t pa; struct local_sysmaps *sysm; pt_entry_t *pte, npte; pa = VM_PAGE_TO_PHYS(m); if (MIPS_DIRECT_MAPPABLE(pa)) { if (pmap_page_get_memattr(m) != VM_MEMATTR_WRITE_BACK) return (MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else return (MIPS_PHYS_TO_DIRECT(pa)); } critical_enter(); sysm = &sysmap_lmem[PCPU_GET(cpuid)]; KASSERT(sysm->valid1 == 0, ("pmap_quick_enter_page: PTE busy")); pte = pmap_pte(kernel_pmap, sysm->base); npte = TLBLO_PA_TO_PFN(pa) | PTE_D | PTE_V | PTE_G; PMAP_PTE_SET_CACHE_BITS(npte, pa, m); *pte = npte; sysm->valid1 = 1; return (sysm->base); #endif } void pmap_quick_remove_page(vm_offset_t addr) { mips_dcache_wbinv_range(addr, PAGE_SIZE); #if !defined(__mips_n64) struct local_sysmaps *sysm; pt_entry_t *pte; if (addr >= MIPS_KSEG0_START && addr < MIPS_KSEG0_END) return; sysm = &sysmap_lmem[PCPU_GET(cpuid)]; KASSERT(sysm->valid1 != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(sysm->base == addr, ("pmap_quick_remove_page: invalid address")); pte = pmap_pte(kernel_pmap, addr); *pte = PTE_G; tlb_invalidate_address(kernel_pmap, addr); sysm->valid1 = 0; critical_exit(); #endif } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; pv_entry_t pv; vm_page_t m; struct pv_chunk *pc, *npc; u_long inuse, bitmask; int allfree, bit, field, idx; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * sizeof(inuse) * NBBY + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va); KASSERT(pde != NULL && *pde != 0, ("pmap_remove_pages: pde")); pte = pmap_pde_to_pte(pde, pv->pv_va); if (!pte_test(pte, PTE_V)) panic("pmap_remove_pages: bad pte"); tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (pte_test(&tpte, PTE_W)) { allfree = 0; continue; } *pte = is_kernel_pmap(pmap) ? PTE_G : 0; m = PHYS_TO_VM_PAGE(TLBLO_PTE_TO_PA(tpte)); KASSERT(m != NULL, ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); /* * Update the vm_page_t clean and reference bits. */ if (pte_test(&tpte, PTE_D)) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, *pde); } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * pmap_testbit tests bits in pte's */ static boolean_t pmap_testbit(vm_page_t m, int bit) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; boolean_t rv = FALSE; if (m->oflags & VPO_UNMANAGED) return (rv); rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); rv = pte_test(pte, bit); PMAP_UNLOCK(pmap); if (rv) break; } return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_W)) count++; PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); return (count); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pmap_t pmap; pt_entry_t pbits, *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); KASSERT(pte != NULL && pte_test(pte, PTE_V), ("page on pv_list has no pte")); pbits = *pte; if (pte_test(&pbits, PTE_D)) { pte_clear(&pbits, PTE_D); vm_page_dirty(m); } pte_set(&pbits, PTE_RO); if (pbits != *pte) { *pte = pbits; pmap_update_page(pmap, pv->pv_va, pbits); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); if (m->md.pv_flags & PV_TABLE_REF) { rw_wlock(&pvh_global_lock); m->md.pv_flags &= ~PV_TABLE_REF; rw_wunlock(&pvh_global_lock); return (1); } return (0); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PTE_D set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_testbit(m, PTE_D); rw_wunlock(&pvh_global_lock); return (rv); } /* N/C */ /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && *pde != 0) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte == 0); } PMAP_UNLOCK(pmap); return (rv); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t va, va_next; vm_paddr_t pa; vm_page_t m; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being write protected. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_MANAGED | PTE_V)) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } pa = TLBLO_PTE_TO_PA(*pte); m = PHYS_TO_VM_PAGE(pa); m->md.pv_flags &= ~PV_TABLE_REF; if (pte_test(pte, PTE_D)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ vm_page_dirty(m); } else { pte_clear(pte, PTE_D); if (va == va_next) va = sva; } } else { /* * Unless PTE_D is set, any TLB entries * mapping "sva" don't allow write access, so * they needn't be invalidated. */ if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set. * If the object containing the page is locked and the page is not * write busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (pte_test(pte, PTE_D)) { pte_clear(pte, PTE_D); pmap_update_page(pmap, pv->pv_va, *pte); } PMAP_UNLOCK(pmap); } rw_wunlock(&pvh_global_lock); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return ((m->md.pv_flags & PV_TABLE_REF) != 0); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. * * Use XKPHYS uncached for 64 bit, and KSEG1 where possible for 32 bit. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, offset; /* * KSEG1 maps only first 512M of phys address space. For * pa > 0x20000000 we should make proper mapping * using pmap_kenter. */ if (MIPS_DIRECT_MAPPABLE(pa + size - 1) && ma == VM_MEMATTR_UNCACHEABLE) return ((void *)MIPS_PHYS_TO_DIRECT_UNCACHED(pa)); else { offset = pa & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = trunc_page(pa); for (tmpva = va; size > 0;) { pmap_kenter_attr(tmpva, pa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } } return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return pmap_mapdev_attr(pa, size, VM_MEMATTR_UNCACHEABLE); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { #ifndef __mips_n64 vm_offset_t base, offset; /* If the address is within KSEG1 then there is nothing to do */ if (va >= MIPS_KSEG1_START && va <= MIPS_KSEG1_END) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(size + offset, PAGE_SIZE); kva_free(base, size); #endif } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m; int val; PMAP_LOCK(pmap); retry: ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; if (!pte_test(&pte, PTE_V)) { val = 0; goto out; } val = MINCORE_INCORE; if (pte_test(&pte, PTE_D)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; pa = TLBLO_PTE_TO_PA(pte); if (pte_test(&pte, PTE_MANAGED)) { /* * This may falsely report the given address as * MINCORE_REFERENCED. Unfortunately, due to the lack of * per-PTE reference information, it is impossible to * determine if the address is MINCORE_REFERENCED. */ m = PHYS_TO_VM_PAGE(pa); - if ((vm_page_aflags(m) & PGA_REFERENCED) != 0) + if ((m->aflags & PGA_REFERENCED) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && pte_test(&pte, PTE_MANAGED)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else out: PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; struct proc *p = td->td_proc; u_int cpuid; critical_enter(); pmap = vmspace_pmap(p->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); if (oldpmap) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); pmap_asid_alloc(pmap); if (td == curthread) { PCPU_SET(segbase, pmap->pm_segtab); mips_wr_entryhi(pmap->pm_asid[cpuid].asid); } PCPU_SET(curpmap, pmap); critical_exit(); } static void pmap_sync_icache_one(void *arg __unused) { mips_icache_sync_all(); mips_dcache_wbinv_all(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { smp_rendezvous(NULL, pmap_sync_icache_one, NULL, NULL); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < PDRSIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((PDRSIZE - superpage_offset) & PDRMASK) < PDRSIZE || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef DDB DB_SHOW_COMMAND(ptable, ddb_pid_dump) { pmap_t pmap; struct thread *td = NULL; struct proc *p; int i, j, k; vm_paddr_t pa; vm_offset_t va; if (have_addr) { td = db_lookup_thread(addr, true); if (td == NULL) { db_printf("Invalid pid or tid"); return; } p = td->td_proc; if (p->p_vmspace == NULL) { db_printf("No vmspace for process"); return; } pmap = vmspace_pmap(p->p_vmspace); } else pmap = kernel_pmap; db_printf("pmap:%p segtab:%p asid:%x generation:%x\n", pmap, pmap->pm_segtab, pmap->pm_asid[0].asid, pmap->pm_asid[0].gen); for (i = 0; i < NPDEPG; i++) { pd_entry_t *pdpe; pt_entry_t *pde; pt_entry_t pte; pdpe = (pd_entry_t *)pmap->pm_segtab[i]; if (pdpe == NULL) continue; db_printf("[%4d] %p\n", i, pdpe); #ifdef __mips_n64 for (j = 0; j < NPDEPG; j++) { pde = (pt_entry_t *)pdpe[j]; if (pde == NULL) continue; db_printf("\t[%4d] %p\n", j, pde); #else { j = 0; pde = (pt_entry_t *)pdpe; #endif for (k = 0; k < NPTEPG; k++) { pte = pde[k]; if (pte == 0 || !pte_test(&pte, PTE_V)) continue; pa = TLBLO_PTE_TO_PA(pte); va = ((u_long)i << SEGSHIFT) | (j << PDRSHIFT) | (k << PAGE_SHIFT); db_printf("\t\t[%04d] va: %p pte: %8jx pa:%jx\n", k, (void *)va, (uintmax_t)pte, (uintmax_t)pa); } } } } #endif /* * Allocate TLB address space tag (called ASID or TLBPID) and return it. * It takes almost as much or more time to search the TLB for a * specific ASID and flush those entries as it does to flush the entire TLB. * Therefore, when we allocate a new ASID, we just take the next number. When * we run out of numbers, we flush the TLB, increment the generation count * and start over. ASID zero is reserved for kernel use. */ static void pmap_asid_alloc(pmap) pmap_t pmap; { if (pmap->pm_asid[PCPU_GET(cpuid)].asid != PMAP_ASID_RESERVED && pmap->pm_asid[PCPU_GET(cpuid)].gen == PCPU_GET(asid_generation)); else { if (PCPU_GET(next_asid) == pmap_max_asid) { tlb_invalidate_all_user(NULL); PCPU_SET(asid_generation, (PCPU_GET(asid_generation) + 1) & ASIDGEN_MASK); if (PCPU_GET(asid_generation) == 0) { PCPU_SET(asid_generation, 1); } PCPU_SET(next_asid, 1); /* 0 means invalid */ } pmap->pm_asid[PCPU_GET(cpuid)].asid = PCPU_GET(next_asid); pmap->pm_asid[PCPU_GET(cpuid)].gen = PCPU_GET(asid_generation); PCPU_SET(next_asid, PCPU_GET(next_asid) + 1); } } static pt_entry_t init_pte_prot(vm_page_t m, vm_prot_t access, vm_prot_t prot) { pt_entry_t rw; if (!(prot & VM_PROT_WRITE)) rw = PTE_V | PTE_RO; else if ((m->oflags & VPO_UNMANAGED) == 0) { if ((access & VM_PROT_WRITE) != 0) rw = PTE_V | PTE_D; else rw = PTE_V; } else /* Needn't emulate a modified bit for unmanaged pages. */ rw = PTE_V | PTE_D; return (rw); } /* * pmap_emulate_modified : do dirty bit emulation * * On SMP, update just the local TLB, other CPUs will update their * TLBs from PTE lazily, if they get the exception. * Returns 0 in case of sucess, 1 if the page is read only and we * need to fault. */ int pmap_emulate_modified(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (pte == NULL) panic("pmap_emulate_modified: can't find PTE"); #ifdef SMP /* It is possible that some other CPU changed m-bit */ if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D)) { tlb_update(pmap, va, *pte); PMAP_UNLOCK(pmap); return (0); } #else if (!pte_test(pte, PTE_V) || pte_test(pte, PTE_D)) panic("pmap_emulate_modified: invalid pte"); #endif if (pte_test(pte, PTE_RO)) { PMAP_UNLOCK(pmap); return (1); } pte_set(pte, PTE_D); tlb_update(pmap, va, *pte); if (!pte_test(pte, PTE_MANAGED)) panic("pmap_emulate_modified: unmanaged page"); PMAP_UNLOCK(pmap); return (0); } /* * Routine: pmap_kextract * Function: * Extract the physical page address associated * virtual address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { int mapped; /* * First, the direct-mapped regions. */ #if defined(__mips_n64) if (va >= MIPS_XKPHYS_START && va < MIPS_XKPHYS_END) return (MIPS_XKPHYS_TO_PHYS(va)); #endif if (va >= MIPS_KSEG0_START && va < MIPS_KSEG0_END) return (MIPS_KSEG0_TO_PHYS(va)); if (va >= MIPS_KSEG1_START && va < MIPS_KSEG1_END) return (MIPS_KSEG1_TO_PHYS(va)); /* * User virtual addresses. */ if (va < VM_MAXUSER_ADDRESS) { pt_entry_t *ptep; if (curproc && curproc->p_vmspace) { ptep = pmap_pte(&curproc->p_vmspace->vm_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } return (0); } } /* * Should be kernel virtual here, otherwise fail */ mapped = (va >= MIPS_KSEG2_START || va < MIPS_KSEG2_END); #if defined(__mips_n64) mapped = mapped || (va >= MIPS_XKSEG_START || va < MIPS_XKSEG_END); #endif /* * Kernel virtual. */ if (mapped) { pt_entry_t *ptep; /* Is the kernel pmap initialized? */ if (!CPU_EMPTY(&kernel_pmap->pm_active)) { /* It's inside the virtual address range */ ptep = pmap_pte(kernel_pmap, va); if (ptep) { return (TLBLO_PTE_TO_PA(*ptep) | (va & PAGE_MASK)); } } return (0); } panic("%s for unknown address space %p.", __func__, (void *)va); } void pmap_flush_pvcache(vm_page_t m) { pv_entry_t pv; if (m != NULL) { for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { mips_dcache_wbinv_range_index(pv->pv_va, PAGE_SIZE); } } } void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { /* * It appears that this function can only be called before any mappings * for the page are established. If this ever changes, this code will * need to walk the pv_list and make each of the existing mappings * uncacheable, being careful to sync caches and PTEs (and maybe * invalidate TLB?) for any current mapping it modifies. */ if (TAILQ_FIRST(&m->md.pv_list) != NULL) panic("Can't change memattr on page with existing mappings"); /* Clean memattr portion of pv_flags */ m->md.pv_flags &= ~PV_MEMATTR_MASK; m->md.pv_flags |= (ma << PV_MEMATTR_SHIFT) & PV_MEMATTR_MASK; } static __inline void pmap_pte_attr(pt_entry_t *pte, vm_memattr_t ma) { u_int npte; npte = *(u_int *)pte; npte &= ~PTE_C_MASK; npte |= PTE_C(ma); *pte = npte; } int pmap_change_attr(vm_offset_t sva, vm_size_t size, vm_memattr_t ma) { pd_entry_t *pde, *pdpe; pt_entry_t *pte; vm_offset_t ova, eva, va, va_next; pmap_t pmap; ova = sva; eva = sva + size; if (eva < sva) return (EINVAL); pmap = kernel_pmap; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pdpe = pmap_segmap(pmap, sva); #ifdef __mips_n64 if (*pdpe == 0) { va_next = (sva + NBSEG) & ~SEGMASK; if (va_next < sva) va_next = eva; continue; } #endif va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if (*pde == NULL) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (!pte_test(pte, PTE_V) || pte_cache_bits(pte) == ma) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; pmap_pte_attr(pte, ma); } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); /* Flush caches to be in the safe side */ mips_dcache_wbinv_range(ova, size); return 0; } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { switch (mode) { case VM_MEMATTR_UNCACHEABLE: case VM_MEMATTR_WRITE_BACK: #ifdef MIPS_CCA_WC case VM_MEMATTR_WRITE_COMBINING: #endif return (TRUE); default: return (FALSE); } } Index: head/sys/powerpc/aim/mmu_oea.c =================================================================== --- head/sys/powerpc/aim/mmu_oea.c (revision 352406) +++ head/sys/powerpc/aim/mmu_oea.c (revision 352407) @@ -1,2778 +1,2777 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-4-Clause * * Copyright (c) 2001 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Matt Thomas of Allegro Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $ */ /*- * Copyright (C) 2001 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_if.h" #define MOEA_DEBUG #define TODO panic("%s: not implemented", __func__); #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_SR(vsid) ((vsid) & 0xf) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) struct ofw_map { vm_offset_t om_va; vm_size_t om_len; vm_offset_t om_pa; u_int om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz; static struct ofw_map *translations; /* * Lock for the pteg and pvo tables. */ struct mtx moea_table_mutex; struct mtx moea_vsid_mutex; /* tlbie instruction synchronization */ static struct mtx tlbie_mtx; /* * PTEG data. */ static struct pteg *moea_pteg_table; u_int moea_pteg_count; u_int moea_pteg_mask; /* * PVO data. */ struct pvo_head *moea_pvo_table; /* pvo entries by pteg index */ struct pvo_head moea_pvo_kunmanaged = LIST_HEAD_INITIALIZER(moea_pvo_kunmanaged); /* list of unmanaged pages */ static struct rwlock_padalign pvh_global_lock; uma_zone_t moea_upvo_zone; /* zone for pvo entries for unmanaged pages */ uma_zone_t moea_mpvo_zone; /* zone for pvo entries for managed pages */ #define BPVO_POOL_SIZE 32768 static struct pvo_entry *moea_bpvo_pool; static int moea_bpvo_pool_index = 0; #define VSID_NBPW (sizeof(u_int32_t) * 8) static u_int moea_vsid_bitmap[NPMAPS / VSID_NBPW]; static boolean_t moea_initialized = FALSE; /* * Statistics. */ u_int moea_pte_valid = 0; u_int moea_pte_overflow = 0; u_int moea_pte_replacements = 0; u_int moea_pvo_entries = 0; u_int moea_pvo_enter_calls = 0; u_int moea_pvo_remove_calls = 0; u_int moea_pte_spills = 0; SYSCTL_INT(_machdep, OID_AUTO, moea_pte_valid, CTLFLAG_RD, &moea_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pte_overflow, CTLFLAG_RD, &moea_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pte_replacements, CTLFLAG_RD, &moea_pte_replacements, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_entries, CTLFLAG_RD, &moea_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_enter_calls, CTLFLAG_RD, &moea_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_remove_calls, CTLFLAG_RD, &moea_pvo_remove_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea_pte_spills, CTLFLAG_RD, &moea_pte_spills, 0, ""); /* * Allocate physical memory for use in moea_bootstrap. */ static vm_offset_t moea_bootstrap_alloc(vm_size_t, u_int); /* * PTE calls. */ static int moea_pte_insert(u_int, struct pte *); /* * PVO calls. */ static int moea_pvo_enter(pmap_t, uma_zone_t, struct pvo_head *, vm_offset_t, vm_paddr_t, u_int, int); static void moea_pvo_remove(struct pvo_entry *, int); static struct pvo_entry *moea_pvo_find_va(pmap_t, vm_offset_t, int *); static struct pte *moea_pvo_to_pte(const struct pvo_entry *, int); /* * Utility routines. */ static int moea_enter_locked(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); static void moea_syncicache(vm_paddr_t, vm_size_t); static boolean_t moea_query_bit(vm_page_t, int); static u_int moea_clear_bit(vm_page_t, int); static void moea_kremove(mmu_t, vm_offset_t); int moea_pte_spill(vm_offset_t); /* * Kernel MMU interface */ void moea_clear_modify(mmu_t, vm_page_t); void moea_copy_page(mmu_t, vm_page_t, vm_page_t); void moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t); void moea_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea_init(mmu_t); boolean_t moea_is_modified(mmu_t, vm_page_t); boolean_t moea_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea_is_referenced(mmu_t, vm_page_t); int moea_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea_page_exists_quick(mmu_t, pmap_t, vm_page_t); void moea_page_init(mmu_t, vm_page_t); int moea_page_wired_mappings(mmu_t, vm_page_t); void moea_pinit(mmu_t, pmap_t); void moea_pinit0(mmu_t, pmap_t); void moea_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea_qremove(mmu_t, vm_offset_t, int); void moea_release(mmu_t, pmap_t); void moea_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea_remove_all(mmu_t, vm_page_t); void moea_remove_write(mmu_t, vm_page_t); void moea_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea_zero_page(mmu_t, vm_page_t); void moea_zero_page_area(mmu_t, vm_page_t, int, int); void moea_activate(mmu_t, struct thread *); void moea_deactivate(mmu_t, struct thread *); void moea_cpu_bootstrap(mmu_t, int); void moea_bootstrap(mmu_t, vm_offset_t, vm_offset_t); void *moea_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); void moea_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea_kextract(mmu_t, vm_offset_t); void moea_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t); void moea_kenter(mmu_t, vm_offset_t, vm_paddr_t); void moea_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma); boolean_t moea_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); void moea_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va); void moea_scan_init(mmu_t mmu); vm_offset_t moea_quick_enter_page(mmu_t mmu, vm_page_t m); void moea_quick_remove_page(mmu_t mmu, vm_offset_t addr); static int moea_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static mmu_method_t moea_methods[] = { MMUMETHOD(mmu_clear_modify, moea_clear_modify), MMUMETHOD(mmu_copy_page, moea_copy_page), MMUMETHOD(mmu_copy_pages, moea_copy_pages), MMUMETHOD(mmu_enter, moea_enter), MMUMETHOD(mmu_enter_object, moea_enter_object), MMUMETHOD(mmu_enter_quick, moea_enter_quick), MMUMETHOD(mmu_extract, moea_extract), MMUMETHOD(mmu_extract_and_hold, moea_extract_and_hold), MMUMETHOD(mmu_init, moea_init), MMUMETHOD(mmu_is_modified, moea_is_modified), MMUMETHOD(mmu_is_prefaultable, moea_is_prefaultable), MMUMETHOD(mmu_is_referenced, moea_is_referenced), MMUMETHOD(mmu_ts_referenced, moea_ts_referenced), MMUMETHOD(mmu_map, moea_map), MMUMETHOD(mmu_page_exists_quick,moea_page_exists_quick), MMUMETHOD(mmu_page_init, moea_page_init), MMUMETHOD(mmu_page_wired_mappings,moea_page_wired_mappings), MMUMETHOD(mmu_pinit, moea_pinit), MMUMETHOD(mmu_pinit0, moea_pinit0), MMUMETHOD(mmu_protect, moea_protect), MMUMETHOD(mmu_qenter, moea_qenter), MMUMETHOD(mmu_qremove, moea_qremove), MMUMETHOD(mmu_release, moea_release), MMUMETHOD(mmu_remove, moea_remove), MMUMETHOD(mmu_remove_all, moea_remove_all), MMUMETHOD(mmu_remove_write, moea_remove_write), MMUMETHOD(mmu_sync_icache, moea_sync_icache), MMUMETHOD(mmu_unwire, moea_unwire), MMUMETHOD(mmu_zero_page, moea_zero_page), MMUMETHOD(mmu_zero_page_area, moea_zero_page_area), MMUMETHOD(mmu_activate, moea_activate), MMUMETHOD(mmu_deactivate, moea_deactivate), MMUMETHOD(mmu_page_set_memattr, moea_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea_quick_remove_page), /* Internal interfaces */ MMUMETHOD(mmu_bootstrap, moea_bootstrap), MMUMETHOD(mmu_cpu_bootstrap, moea_cpu_bootstrap), MMUMETHOD(mmu_mapdev_attr, moea_mapdev_attr), MMUMETHOD(mmu_mapdev, moea_mapdev), MMUMETHOD(mmu_unmapdev, moea_unmapdev), MMUMETHOD(mmu_kextract, moea_kextract), MMUMETHOD(mmu_kenter, moea_kenter), MMUMETHOD(mmu_kenter_attr, moea_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea_dev_direct_mapped), MMUMETHOD(mmu_scan_init, moea_scan_init), MMUMETHOD(mmu_dumpsys_map, moea_dumpsys_map), MMUMETHOD(mmu_map_user_ptr, moea_map_user_ptr), MMUMETHOD(mmu_decode_kernel_ptr, moea_decode_kernel_ptr), { 0, 0 } }; MMU_DEF(oea_mmu, MMU_TYPE_OEA, moea_methods, 0); static __inline uint32_t moea_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint32_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (PTE_I | PTE_G); case VM_MEMATTR_CACHEABLE: return (PTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (PTE_I); case VM_MEMATTR_WRITE_THROUGH: return (PTE_W | PTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = PTE_I | PTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo = PTE_M; break; } } return pte_lo; } static void tlbie(vm_offset_t va) { mtx_lock_spin(&tlbie_mtx); __asm __volatile("ptesync"); __asm __volatile("tlbie %0" :: "r"(va)); __asm __volatile("eieio; tlbsync; ptesync"); mtx_unlock_spin(&tlbie_mtx); } static void tlbia(void) { vm_offset_t va; for (va = 0; va < 0x00040000; va += 0x00001000) { __asm __volatile("tlbie %0" :: "r"(va)); powerpc_sync(); } __asm __volatile("tlbsync"); powerpc_sync(); } static __inline int va_to_sr(u_int *sr, vm_offset_t va) { return (sr[(uintptr_t)va >> ADDR_SR_SHFT]); } static __inline u_int va_to_pteg(u_int sr, vm_offset_t addr) { u_int hash; hash = (sr & SR_VSID_MASK) ^ (((u_int)addr & ADDR_PIDX) >> ADDR_PIDX_SHFT); return (hash & moea_pteg_mask); } static __inline struct pvo_head * vm_page_to_pvoh(vm_page_t m) { return (&m->md.mdpg_pvoh); } static __inline void moea_attr_clear(vm_page_t m, int ptebit) { rw_assert(&pvh_global_lock, RA_WLOCKED); m->md.mdpg_attrs &= ~ptebit; } static __inline int moea_attr_fetch(vm_page_t m) { return (m->md.mdpg_attrs); } static __inline void moea_attr_save(vm_page_t m, int ptebit) { rw_assert(&pvh_global_lock, RA_WLOCKED); m->md.mdpg_attrs |= ptebit; } static __inline int moea_pte_compare(const struct pte *pt, const struct pte *pvo_pt) { if (pt->pte_hi == pvo_pt->pte_hi) return (1); return (0); } static __inline int moea_pte_match(struct pte *pt, u_int sr, vm_offset_t va, int which) { return (pt->pte_hi & ~PTE_VALID) == (((sr & SR_VSID_MASK) << PTE_VSID_SHFT) | ((va >> ADDR_API_SHFT) & PTE_API) | which); } static __inline void moea_pte_create(struct pte *pt, u_int sr, vm_offset_t va, u_int pte_lo) { mtx_assert(&moea_table_mutex, MA_OWNED); /* * Construct a PTE. Default to IMB initially. Valid bit only gets * set when the real pte is set in memory. * * Note: Don't set the valid bit for correct operation of tlb update. */ pt->pte_hi = ((sr & SR_VSID_MASK) << PTE_VSID_SHFT) | (((va & ADDR_PIDX) >> ADDR_API_SHFT) & PTE_API); pt->pte_lo = pte_lo; } static __inline void moea_pte_synch(struct pte *pt, struct pte *pvo_pt) { mtx_assert(&moea_table_mutex, MA_OWNED); pvo_pt->pte_lo |= pt->pte_lo & (PTE_REF | PTE_CHG); } static __inline void moea_pte_clear(struct pte *pt, vm_offset_t va, int ptebit) { mtx_assert(&moea_table_mutex, MA_OWNED); /* * As shown in Section 7.6.3.2.3 */ pt->pte_lo &= ~ptebit; tlbie(va); } static __inline void moea_pte_set(struct pte *pt, struct pte *pvo_pt) { mtx_assert(&moea_table_mutex, MA_OWNED); pvo_pt->pte_hi |= PTE_VALID; /* * Update the PTE as defined in section 7.6.3.1. * Note that the REF/CHG bits are from pvo_pt and thus should have * been saved so this routine can restore them (if desired). */ pt->pte_lo = pvo_pt->pte_lo; powerpc_sync(); pt->pte_hi = pvo_pt->pte_hi; powerpc_sync(); moea_pte_valid++; } static __inline void moea_pte_unset(struct pte *pt, struct pte *pvo_pt, vm_offset_t va) { mtx_assert(&moea_table_mutex, MA_OWNED); pvo_pt->pte_hi &= ~PTE_VALID; /* * Force the reg & chg bits back into the PTEs. */ powerpc_sync(); /* * Invalidate the pte. */ pt->pte_hi &= ~PTE_VALID; tlbie(va); /* * Save the reg & chg bits. */ moea_pte_synch(pt, pvo_pt); moea_pte_valid--; } static __inline void moea_pte_change(struct pte *pt, struct pte *pvo_pt, vm_offset_t va) { /* * Invalidate the PTE */ moea_pte_unset(pt, pvo_pt, va); moea_pte_set(pt, pvo_pt); } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } void moea_cpu_bootstrap(mmu_t mmup, int ap) { u_int sdr; int i; if (ap) { powerpc_sync(); __asm __volatile("mtdbatu 0,%0" :: "r"(battable[0].batu)); __asm __volatile("mtdbatl 0,%0" :: "r"(battable[0].batl)); isync(); __asm __volatile("mtibatu 0,%0" :: "r"(battable[0].batu)); __asm __volatile("mtibatl 0,%0" :: "r"(battable[0].batl)); isync(); } __asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu)); __asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl)); isync(); __asm __volatile("mtibatu 1,%0" :: "r"(0)); __asm __volatile("mtdbatu 2,%0" :: "r"(0)); __asm __volatile("mtibatu 2,%0" :: "r"(0)); __asm __volatile("mtdbatu 3,%0" :: "r"(0)); __asm __volatile("mtibatu 3,%0" :: "r"(0)); isync(); for (i = 0; i < 16; i++) mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]); powerpc_sync(); sdr = (u_int)moea_pteg_table | (moea_pteg_mask >> 10); __asm __volatile("mtsdr1 %0" :: "r"(sdr)); isync(); tlbia(); } void moea_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen, mmu; int sz; int i, j; vm_size_t size, physsz, hwphyssz; vm_offset_t pa, va, off; void *dpcpu; register_t msr; /* * Set up BAT0 to map the lowest 256 MB area */ battable[0x0].batl = BATL(0x00000000, BAT_M, BAT_PP_RW); battable[0x0].batu = BATU(0x00000000, BAT_BL_256M, BAT_Vs); /* * Map PCI memory space. */ battable[0x8].batl = BATL(0x80000000, BAT_I|BAT_G, BAT_PP_RW); battable[0x8].batu = BATU(0x80000000, BAT_BL_256M, BAT_Vs); battable[0x9].batl = BATL(0x90000000, BAT_I|BAT_G, BAT_PP_RW); battable[0x9].batu = BATU(0x90000000, BAT_BL_256M, BAT_Vs); battable[0xa].batl = BATL(0xa0000000, BAT_I|BAT_G, BAT_PP_RW); battable[0xa].batu = BATU(0xa0000000, BAT_BL_256M, BAT_Vs); battable[0xb].batl = BATL(0xb0000000, BAT_I|BAT_G, BAT_PP_RW); battable[0xb].batu = BATU(0xb0000000, BAT_BL_256M, BAT_Vs); /* * Map obio devices. */ battable[0xf].batl = BATL(0xf0000000, BAT_I|BAT_G, BAT_PP_RW); battable[0xf].batu = BATU(0xf0000000, BAT_BL_256M, BAT_Vs); /* * Use an IBAT and a DBAT to map the bottom segment of memory * where we are. Turn off instruction relocation temporarily * to prevent faults while reprogramming the IBAT. */ msr = mfmsr(); mtmsr(msr & ~PSL_IR); __asm (".balign 32; \n" "mtibatu 0,%0; mtibatl 0,%1; isync; \n" "mtdbatu 0,%0; mtdbatl 0,%1; isync" :: "r"(battable[0].batu), "r"(battable[0].batl)); mtmsr(msr); /* map pci space */ __asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu)); __asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl)); isync(); /* set global direct map flag */ hw_direct_map = 1; mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea_bootstrap: physical memory"); for (i = 0; i < pregions_sz; i++) { vm_offset_t pa; vm_offset_t end; CTR3(KTR_PMAP, "physregion: %#x - %#x (%#x)", pregions[i].mr_start, pregions[i].mr_start + pregions[i].mr_size, pregions[i].mr_size); /* * Install entries into the BAT table to allow all * of physmem to be convered by on-demand BAT entries. * The loop will sometimes set the same battable element * twice, but that's fine since they won't be used for * a while yet. */ pa = pregions[i].mr_start & 0xf0000000; end = pregions[i].mr_start + pregions[i].mr_size; do { u_int n = pa >> ADDR_SR_SHFT; battable[n].batl = BATL(pa, BAT_M, BAT_PP_RW); battable[n].batu = BATU(pa, BAT_BL_256M, BAT_Vs); pa += SEGMENT_LENGTH; } while (pa < end); } if (PHYS_AVAIL_ENTRIES < regions_sz) panic("moea_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; } /* Check for overlap with the kernel and exception vectors */ for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (kernelstart >= phys_avail[j] && kernelstart < phys_avail[j+1]) { if (kernelend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelstart & ~PAGE_MASK; } if (kernelend >= phys_avail[j] && kernelend < phys_avail[j+1]) { if (kernelstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE; } } physmem = btoc(physsz); /* * Allocate PTEG table. */ #ifdef PTEGCOUNT moea_pteg_count = PTEGCOUNT; #else moea_pteg_count = 0x1000; while (moea_pteg_count < physmem) moea_pteg_count <<= 1; moea_pteg_count >>= 1; #endif /* PTEGCOUNT */ size = moea_pteg_count * sizeof(struct pteg); CTR2(KTR_PMAP, "moea_bootstrap: %d PTEGs, %d bytes", moea_pteg_count, size); moea_pteg_table = (struct pteg *)moea_bootstrap_alloc(size, size); CTR1(KTR_PMAP, "moea_bootstrap: PTEG table at %p", moea_pteg_table); bzero((void *)moea_pteg_table, moea_pteg_count * sizeof(struct pteg)); moea_pteg_mask = moea_pteg_count - 1; /* * Allocate pv/overflow lists. */ size = sizeof(struct pvo_head) * moea_pteg_count; moea_pvo_table = (struct pvo_head *)moea_bootstrap_alloc(size, PAGE_SIZE); CTR1(KTR_PMAP, "moea_bootstrap: PVO table at %p", moea_pvo_table); for (i = 0; i < moea_pteg_count; i++) LIST_INIT(&moea_pvo_table[i]); /* * Initialize the lock that synchronizes access to the pteg and pvo * tables. */ mtx_init(&moea_table_mutex, "pmap table", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&moea_vsid_mutex, "VSID table", NULL, MTX_DEF); mtx_init(&tlbie_mtx, "tlbie", NULL, MTX_SPIN); /* * Initialise the unmanaged pvo pool. */ moea_bpvo_pool = (struct pvo_entry *)moea_bootstrap_alloc( BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0); moea_bpvo_pool_index = 0; /* * Make sure kernel vsid is allocated as well as VSID 0. */ moea_vsid_bitmap[(KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea_vsid_bitmap[0] |= 1; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /* * Set up the Open Firmware mappings */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1 && (mmu = OF_instance_to_package(mmui)) != -1 && (sz = OF_getproplen(mmu, "translations")) != -1) { translations = NULL; for (i = 0; phys_avail[i] != 0; i += 2) { if (phys_avail[i + 1] >= sz) { translations = (struct ofw_map *)phys_avail[i]; break; } } if (translations == NULL) panic("moea_bootstrap: no space to copy translations"); bzero(translations, sz); if (OF_getprop(mmu, "translations", translations, sz) == -1) panic("moea_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea_bootstrap: translations"); sz /= sizeof(*translations); qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x", translations[i].om_pa, translations[i].om_va, translations[i].om_len); /* * If the mapping is 1:1, let the RAM and device * on-demand BAT tables take care of the translation. */ if (translations[i].om_va == translations[i].om_pa) continue; /* Enter the pages */ for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) moea_kenter(mmup, translations[i].om_va + off, translations[i].om_pa + off); } } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); moea_cpu_bootstrap(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = kstack_pages; for (i = 0; i < kstack_pages; i++) { moea_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, 0); } /* * Activate a user pmap. The pmap must be activated before it's address * space can be accessed in any way. */ void moea_activate(mmu_t mmu, struct thread *td) { pmap_t pm, pmr; /* * Load all the data we need up front to encourage the compiler to * not issue any loads while we have interrupts disabled below. */ pm = &td->td_proc->p_vmspace->vm_pmap; pmr = pm->pmap_phys; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); PCPU_SET(curpmap, pmr); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); } void moea_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); PCPU_SET(curpmap, NULL); } void moea_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } void moea_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); bcopy((void *)src, (void *)dst, PAGE_SIZE); } void moea_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Zero a page of physical memory by temporarily mapping it into the tlb. */ void moea_zero_page(mmu_t mmu, vm_page_t m) { vm_offset_t off, pa = VM_PAGE_TO_PHYS(m); for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(pa + off)); } void moea_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_offset_t pa = VM_PAGE_TO_PHYS(m); void *va = (void *)(pa + off); bzero(va, size); } vm_offset_t moea_quick_enter_page(mmu_t mmu, vm_page_t m) { return (VM_PAGE_TO_PHYS(m)); } void moea_quick_remove_page(mmu_t mmu, vm_offset_t addr) { } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { int error; for (;;) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); error = moea_enter_locked(pmap, va, m, prot, flags, psind); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); if (error != ENOMEM) return (KERN_SUCCESS); if ((flags & PMAP_ENTER_NOSLEEP) != 0) return (KERN_RESOURCE_SHORTAGE); VM_OBJECT_ASSERT_UNLOCKED(m->object); vm_wait(NULL); } } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The global pvh and pmap must be locked. */ static int moea_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct pvo_head *pvo_head; uma_zone_t zone; u_int pte_lo, pvo_flags; int error; if (pmap_bootstrapped) rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); if ((m->oflags & VPO_UNMANAGED) != 0 || !moea_initialized) { pvo_head = &moea_pvo_kunmanaged; zone = moea_upvo_zone; pvo_flags = 0; } else { pvo_head = vm_page_to_pvoh(m); zone = moea_mpvo_zone; pvo_flags = PVO_MANAGED; } pte_lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); if (prot & VM_PROT_WRITE) { pte_lo |= PTE_BW; if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } else pte_lo |= PTE_BR; if ((flags & PMAP_ENTER_WIRED) != 0) pvo_flags |= PVO_WIRED; error = moea_pvo_enter(pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m), pte_lo, pvo_flags); /* * Flush the real page from the instruction cache. This has be done * for all user mappings to prevent information leakage via the * instruction cache. moea_pvo_enter() returns ENOENT for the first * mapping for a page. */ if (pmap != kernel_pmap && error == ENOENT && (pte_lo & (PTE_I | PTE_G)) == 0) moea_syncicache(VM_PAGE_TO_PHYS(m), PAGE_SIZE); return (error); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea_enter_locked(pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), 0, 0); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); } void moea_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); moea_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), 0, 0); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); } vm_paddr_t moea_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL); if (pvo == NULL) pa = 0; else pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL); if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID) && ((pvo->pvo_pte.pte.pte_lo & PTE_PP) == PTE_RW || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte.pte_lo & PTE_RPGN); if (!vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } void moea_init(mmu_t mmu) { moea_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); moea_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); moea_initialized = TRUE; } boolean_t moea_is_referenced(mmu_t mmu, vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = moea_query_bit(m, PTE_REF); rw_wunlock(&pvh_global_lock); return (rv); } boolean_t moea_is_modified(mmu_t mmu, vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PTE_CHG set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = moea_query_bit(m, PTE_CHG); rw_wunlock(&pvh_global_lock); return (rv); } boolean_t moea_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv; PMAP_LOCK(pmap); pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL); rv = pvo == NULL || (pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0; PMAP_UNLOCK(pmap); return (rv); } void moea_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("moea_clear_modify: page %p is exclusive busy", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_CHG * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); moea_clear_bit(m, PTE_CHG); rw_wunlock(&pvh_global_lock); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; struct pte *pt; pmap_t pmap; u_int lo; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); lo = moea_attr_fetch(m); powerpc_sync(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if ((pvo->pvo_pte.pte.pte_lo & PTE_PP) != PTE_BR) { pt = moea_pvo_to_pte(pvo, -1); pvo->pvo_pte.pte.pte_lo &= ~PTE_PP; pvo->pvo_pte.pte.pte_lo |= PTE_BR; if (pt != NULL) { moea_pte_synch(pt, &pvo->pvo_pte.pte); lo |= pvo->pvo_pte.pte.pte_lo; pvo->pvo_pte.pte.pte_lo &= ~PTE_CHG; moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); mtx_unlock(&moea_table_mutex); } } PMAP_UNLOCK(pmap); } if ((lo & PTE_CHG) != 0) { moea_attr_clear(m, PTE_CHG); vm_page_dirty(m); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * moea_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea_ts_referenced(mmu_t mmu, vm_page_t m) { int count; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_ts_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); count = moea_clear_bit(m, PTE_REF); rw_wunlock(&pvh_global_lock); return (count); } /* * Modify the WIMG settings of all mappings for a page. */ void moea_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; struct pvo_head *pvo_head; struct pte *pt; pmap_t pmap; u_int lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } rw_wlock(&pvh_global_lock); pvo_head = vm_page_to_pvoh(m); lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), ma); LIST_FOREACH(pvo, pvo_head, pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); pt = moea_pvo_to_pte(pvo, -1); pvo->pvo_pte.pte.pte_lo &= ~PTE_WIMG; pvo->pvo_pte.pte.pte_lo |= lo; if (pt != NULL) { moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); if (pvo->pvo_pmap == kernel_pmap) isync(); } mtx_unlock(&moea_table_mutex); PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; rw_wunlock(&pvh_global_lock); } /* * Map a wired page into kernel virtual address space. */ void moea_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } void moea_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { u_int pte_lo; int error; #if 0 if (va < VM_MIN_KERNEL_ADDRESS) panic("moea_kenter: attempt to enter non-kernel address %#x", va); #endif pte_lo = moea_calc_wimg(pa, ma); PMAP_LOCK(kernel_pmap); error = moea_pvo_enter(kernel_pmap, moea_upvo_zone, &moea_pvo_kunmanaged, va, pa, pte_lo, PVO_WIRED); if (error != 0 && error != ENOENT) panic("moea_kenter: failed to enter va %#x pa %#x: %d", va, pa, error); PMAP_UNLOCK(kernel_pmap); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Allow direct mappings on 32-bit OEA */ if (va < VM_MIN_KERNEL_ADDRESS) { return (va); } PMAP_LOCK(kernel_pmap); pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); KASSERT(pvo != NULL, ("moea_kextract: no addr found")); pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea_kremove(mmu_t mmu, vm_offset_t va) { moea_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ int moea_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { size_t l; register_t vsid; *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); if (l > ulen) l = ulen; if (klen) *klen = l; else if (l != ulen) return (EFAULT); vsid = va_to_vsid(pm, (vm_offset_t)uaddr); /* Mark segment no-execute */ vsid |= SR_N; /* If we have already set this VSID, we can just return */ if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == vsid) return (0); __asm __volatile("isync"); curthread->td_pcb->pcb_cpu.aim.usr_segm = (uintptr_t)uaddr >> ADDR_SR_SHFT; curthread->td_pcb->pcb_cpu.aim.usr_vsid = vsid; __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(vsid)); return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int moea_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { vm_offset_t user_sr; if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; addr &= ADDR_PIDX | ADDR_POFF; addr |= user_sr << ADDR_SR_SHFT; *decoded_addr = addr; *is_user = 1; } else { *decoded_addr = addr; *is_user = 0; } return (0); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. We cannot and therefore do not; *virt is updated with the * first usable address after the mapped region. */ vm_offset_t moea_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; sva = *virt; va = sva; for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; rw_wlock(&pvh_global_lock); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } void moea_page_init(mmu_t mmu __unused, vm_page_t m) { m->md.mdpg_attrs = 0; m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; LIST_INIT(&m->md.mdpg_pvoh); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & PVO_WIRED) != 0) count++; rw_wunlock(&pvh_global_lock); return (count); } static u_int moea_vsidcontext; void moea_pinit(mmu_t mmu, pmap_t pmap) { int i, mask; u_int entropy; KASSERT((int)pmap < VM_MIN_KERNEL_ADDRESS, ("moea_pinit: virt pmap")); RB_INIT(&pmap->pmap_pvo); entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); if ((pmap->pmap_phys = (pmap_t)moea_kextract(mmu, (vm_offset_t)pmap)) == NULL) { pmap->pmap_phys = pmap; } mtx_lock(&moea_vsid_mutex); /* * Allocate some segment registers for this pmap. */ for (i = 0; i < NPMAPS; i += VSID_NBPW) { u_int hash, n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea_vsidcontext = (moea_vsidcontext * 0x1105) + entropy; hash = moea_vsidcontext & (NPMAPS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea_vsidcontext & 0xfffff); if (moea_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea_vsid_bitmap[n] == 0xffffffff) { entropy = (moea_vsidcontext >> 20); continue; } i = ffs(~moea_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(0xfffff, VSID_NBPW); hash |= i; } KASSERT(!(moea_vsid_bitmap[n] & mask), ("Allocating in-use VSID group %#x\n", hash)); moea_vsid_bitmap[n] |= mask; for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); mtx_unlock(&moea_vsid_mutex); return; } mtx_unlock(&moea_vsid_mutex); panic("moea_pinit: out of segments"); } /* * Initialize the pmap associated with process 0. */ void moea_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ void moea_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; struct pte *pt; KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea_remove(mmu, pm, sva, eva); return; } rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * Grab the PTE pointer before we diddle with the cached PTE * copy. */ pt = moea_pvo_to_pte(pvo, -1); /* * Change the protection of the page. */ pvo->pvo_pte.pte.pte_lo &= ~PTE_PP; pvo->pvo_pte.pte.pte_lo |= PTE_BR; /* * If the PVO is in the page table, update that pte as well. */ if (pt != NULL) { moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); mtx_unlock(&moea_table_mutex); } } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { moea_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea_qenter. */ void moea_qremove(mmu_t mmu, vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { moea_kremove(mmu, va); va += PAGE_SIZE; } } void moea_release(mmu_t mmu, pmap_t pmap) { int idx, mask; /* * Free segment register's VSID */ if (pmap->pm_sr[0] == 0) panic("moea_release"); mtx_lock(&moea_vsid_mutex); idx = VSID_TO_HASH(pmap->pm_sr[0]) & (NPMAPS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; moea_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea_vsid_mutex); } /* * Remove the given range of addresses from the specified map. */ void moea_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; rw_wlock(&pvh_global_lock); PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea_pvo_remove(pvo, -1); } PMAP_UNLOCK(pm); rw_wunlock(&pvh_global_lock); } /* * Remove physical page from all pmaps in which it resides. moea_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_head *pvo_head; struct pvo_entry *pvo, *next_pvo; pmap_t pmap; rw_wlock(&pvh_global_lock); pvo_head = vm_page_to_pvoh(m); for (pvo = LIST_FIRST(pvo_head); pvo != NULL; pvo = next_pvo) { next_pvo = LIST_NEXT(pvo, pvo_vlink); pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); moea_pvo_remove(pvo, -1); PMAP_UNLOCK(pmap); } - if ((vm_page_aflags(m) & PGA_WRITEABLE) != 0 && - moea_query_bit(m, PTE_CHG)) { + if ((m->aflags & PGA_WRITEABLE) && moea_query_bit(m, PTE_CHG)) { moea_attr_clear(m, PTE_CHG); vm_page_dirty(m); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea_bootstrap before avail start and end are * calculated. */ static vm_offset_t moea_bootstrap_alloc(vm_size_t size, u_int align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea_bootstrap_alloc: could not allocate memory"); } static void moea_syncicache(vm_paddr_t pa, vm_size_t len) { __syncicache((void *)pa, len); } static int moea_pvo_enter(pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head, vm_offset_t va, vm_paddr_t pa, u_int pte_lo, int flags) { struct pvo_entry *pvo; u_int sr; int first; u_int ptegidx; int i; int bootstrap; moea_pvo_enter_calls++; first = 0; bootstrap = 0; /* * Compute the PTE Group index. */ va &= ~ADDR_POFF; sr = va_to_sr(pm->pm_sr, va); ptegidx = va_to_pteg(sr, va); /* * Remove any existing mapping for this page. Reuse the pvo entry if * there is a mapping. */ mtx_lock(&moea_table_mutex); LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { if ((pvo->pvo_pte.pte.pte_lo & PTE_RPGN) == pa && (pvo->pvo_pte.pte.pte_lo & PTE_PP) == (pte_lo & PTE_PP)) { /* * The PTE is not changing. Instead, this may * be a request to change the mapping's wired * attribute. */ mtx_unlock(&moea_table_mutex); if ((flags & PVO_WIRED) != 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) { pvo->pvo_vaddr |= PVO_WIRED; pm->pm_stats.wired_count++; } else if ((flags & PVO_WIRED) == 0 && (pvo->pvo_vaddr & PVO_WIRED) != 0) { pvo->pvo_vaddr &= ~PVO_WIRED; pm->pm_stats.wired_count--; } return (0); } moea_pvo_remove(pvo, -1); break; } } /* * If we aren't overwriting a mapping, try to allocate. */ if (moea_initialized) { pvo = uma_zalloc(zone, M_NOWAIT); } else { if (moea_bpvo_pool_index >= BPVO_POOL_SIZE) { panic("moea_enter: bpvo pool exhausted, %d, %d, %d", moea_bpvo_pool_index, BPVO_POOL_SIZE, BPVO_POOL_SIZE * sizeof(struct pvo_entry)); } pvo = &moea_bpvo_pool[moea_bpvo_pool_index]; moea_bpvo_pool_index++; bootstrap = 1; } if (pvo == NULL) { mtx_unlock(&moea_table_mutex); return (ENOMEM); } moea_pvo_entries++; pvo->pvo_vaddr = va; pvo->pvo_pmap = pm; LIST_INSERT_HEAD(&moea_pvo_table[ptegidx], pvo, pvo_olink); pvo->pvo_vaddr &= ~ADDR_POFF; if (flags & PVO_WIRED) pvo->pvo_vaddr |= PVO_WIRED; if (pvo_head != &moea_pvo_kunmanaged) pvo->pvo_vaddr |= PVO_MANAGED; if (bootstrap) pvo->pvo_vaddr |= PVO_BOOTSTRAP; moea_pte_create(&pvo->pvo_pte.pte, sr, va, pa | pte_lo); /* * Add to pmap list */ RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo); /* * Remember if the list was empty and therefore will be the first * item. */ if (LIST_FIRST(pvo_head) == NULL) first = 1; LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); if (pvo->pvo_vaddr & PVO_WIRED) pm->pm_stats.wired_count++; pm->pm_stats.resident_count++; i = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte); KASSERT(i < 8, ("Invalid PTE index")); if (i >= 0) { PVO_PTEGIDX_SET(pvo, i); } else { panic("moea_pvo_enter: overflow"); moea_pte_overflow++; } mtx_unlock(&moea_table_mutex); return (first ? ENOENT : 0); } static void moea_pvo_remove(struct pvo_entry *pvo, int pteidx) { struct pte *pt; /* * If there is an active pte entry, we need to deactivate it (and * save the ref & cfg bits). */ pt = moea_pvo_to_pte(pvo, pteidx); if (pt != NULL) { moea_pte_unset(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr); mtx_unlock(&moea_table_mutex); PVO_PTEGIDX_CLR(pvo); } else { moea_pte_overflow--; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Save the REF/CHG bits into their cache if the page is managed. */ if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED) { struct vm_page *pg; pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pte.pte_lo & PTE_RPGN); if (pg != NULL) { moea_attr_save(pg, pvo->pvo_pte.pte.pte_lo & (PTE_REF | PTE_CHG)); } } /* * Remove this PVO from the PV and pmap lists. */ LIST_REMOVE(pvo, pvo_vlink); RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Remove this from the overflow list and return it to the pool * if we aren't going to reuse it. */ LIST_REMOVE(pvo, pvo_olink); if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(pvo->pvo_vaddr & PVO_MANAGED ? moea_mpvo_zone : moea_upvo_zone, pvo); moea_pvo_entries--; moea_pvo_remove_calls++; } static __inline int moea_pvo_pte_index(const struct pvo_entry *pvo, int ptegidx) { int pteidx; /* * We can find the actual pte entry without searching by grabbing * the PTEG index from 3 unused bits in pte_lo[11:9] and by * noticing the HID bit. */ pteidx = ptegidx * 8 + PVO_PTEGIDX_GET(pvo); if (pvo->pvo_pte.pte.pte_hi & PTE_HID) pteidx ^= moea_pteg_mask * 8; return (pteidx); } static struct pvo_entry * moea_pvo_find_va(pmap_t pm, vm_offset_t va, int *pteidx_p) { struct pvo_entry *pvo; int ptegidx; u_int sr; va &= ~ADDR_POFF; sr = va_to_sr(pm->pm_sr, va); ptegidx = va_to_pteg(sr, va); mtx_lock(&moea_table_mutex); LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) { if (pteidx_p) *pteidx_p = moea_pvo_pte_index(pvo, ptegidx); break; } } mtx_unlock(&moea_table_mutex); return (pvo); } static struct pte * moea_pvo_to_pte(const struct pvo_entry *pvo, int pteidx) { struct pte *pt; /* * If we haven't been supplied the ptegidx, calculate it. */ if (pteidx == -1) { int ptegidx; u_int sr; sr = va_to_sr(pvo->pvo_pmap->pm_sr, pvo->pvo_vaddr); ptegidx = va_to_pteg(sr, pvo->pvo_vaddr); pteidx = moea_pvo_pte_index(pvo, ptegidx); } pt = &moea_pteg_table[pteidx >> 3].pt[pteidx & 7]; mtx_lock(&moea_table_mutex); if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) && !PVO_PTEGIDX_ISSET(pvo)) { panic("moea_pvo_to_pte: pvo %p has valid pte in pvo but no " "valid pte index", pvo); } if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0 && PVO_PTEGIDX_ISSET(pvo)) { panic("moea_pvo_to_pte: pvo %p has valid pte index in pvo " "pvo but no valid pte", pvo); } if ((pt->pte_hi ^ (pvo->pvo_pte.pte.pte_hi & ~PTE_VALID)) == PTE_VALID) { if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0) { panic("moea_pvo_to_pte: pvo %p has valid pte in " "moea_pteg_table %p but invalid in pvo", pvo, pt); } if (((pt->pte_lo ^ pvo->pvo_pte.pte.pte_lo) & ~(PTE_CHG|PTE_REF)) != 0) { panic("moea_pvo_to_pte: pvo %p pte does not match " "pte %p in moea_pteg_table", pvo, pt); } mtx_assert(&moea_table_mutex, MA_OWNED); return (pt); } if (pvo->pvo_pte.pte.pte_hi & PTE_VALID) { panic("moea_pvo_to_pte: pvo %p has invalid pte %p in " "moea_pteg_table but valid in pvo: %8x, %8x", pvo, pt, pvo->pvo_pte.pte.pte_hi, pt->pte_hi); } mtx_unlock(&moea_table_mutex); return (NULL); } /* * XXX: THIS STUFF SHOULD BE IN pte.c? */ int moea_pte_spill(vm_offset_t addr) { struct pvo_entry *source_pvo, *victim_pvo; struct pvo_entry *pvo; int ptegidx, i, j; u_int sr; struct pteg *pteg; struct pte *pt; moea_pte_spills++; sr = mfsrin(addr); ptegidx = va_to_pteg(sr, addr); /* * Have to substitute some entry. Use the primary hash for this. * Use low bits of timebase as random generator. */ pteg = &moea_pteg_table[ptegidx]; mtx_lock(&moea_table_mutex); __asm __volatile("mftb %0" : "=r"(i)); i &= 7; pt = &pteg->pt[i]; source_pvo = NULL; victim_pvo = NULL; LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) { /* * We need to find a pvo entry for this address. */ if (source_pvo == NULL && moea_pte_match(&pvo->pvo_pte.pte, sr, addr, pvo->pvo_pte.pte.pte_hi & PTE_HID)) { /* * Now found an entry to be spilled into the pteg. * The PTE is now valid, so we know it's active. */ j = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte); if (j >= 0) { PVO_PTEGIDX_SET(pvo, j); moea_pte_overflow--; mtx_unlock(&moea_table_mutex); return (1); } source_pvo = pvo; if (victim_pvo != NULL) break; } /* * We also need the pvo entry of the victim we are replacing * so save the R & C bits of the PTE. */ if ((pt->pte_hi & PTE_HID) == 0 && victim_pvo == NULL && moea_pte_compare(pt, &pvo->pvo_pte.pte)) { victim_pvo = pvo; if (source_pvo != NULL) break; } } if (source_pvo == NULL) { mtx_unlock(&moea_table_mutex); return (0); } if (victim_pvo == NULL) { if ((pt->pte_hi & PTE_HID) == 0) panic("moea_pte_spill: victim p-pte (%p) has no pvo" "entry", pt); /* * If this is a secondary PTE, we need to search it's primary * pvo bucket for the matching PVO. */ LIST_FOREACH(pvo, &moea_pvo_table[ptegidx ^ moea_pteg_mask], pvo_olink) { /* * We also need the pvo entry of the victim we are * replacing so save the R & C bits of the PTE. */ if (moea_pte_compare(pt, &pvo->pvo_pte.pte)) { victim_pvo = pvo; break; } } if (victim_pvo == NULL) panic("moea_pte_spill: victim s-pte (%p) has no pvo" "entry", pt); } /* * We are invalidating the TLB entry for the EA we are replacing even * though it's valid. If we don't, we lose any ref/chg bit changes * contained in the TLB entry. */ source_pvo->pvo_pte.pte.pte_hi &= ~PTE_HID; moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr); moea_pte_set(pt, &source_pvo->pvo_pte.pte); PVO_PTEGIDX_CLR(victim_pvo); PVO_PTEGIDX_SET(source_pvo, i); moea_pte_replacements++; mtx_unlock(&moea_table_mutex); return (1); } static __inline struct pvo_entry * moea_pte_spillable_ident(u_int ptegidx) { struct pte *pt; struct pvo_entry *pvo_walk, *pvo = NULL; LIST_FOREACH(pvo_walk, &moea_pvo_table[ptegidx], pvo_olink) { if (pvo_walk->pvo_vaddr & PVO_WIRED) continue; if (!(pvo_walk->pvo_pte.pte.pte_hi & PTE_VALID)) continue; pt = moea_pvo_to_pte(pvo_walk, -1); if (pt == NULL) continue; pvo = pvo_walk; mtx_unlock(&moea_table_mutex); if (!(pt->pte_lo & PTE_REF)) return (pvo_walk); } return (pvo); } static int moea_pte_insert(u_int ptegidx, struct pte *pvo_pt) { struct pte *pt; struct pvo_entry *victim_pvo; int i; int victim_idx; u_int pteg_bkpidx = ptegidx; mtx_assert(&moea_table_mutex, MA_OWNED); /* * First try primary hash. */ for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) { if ((pt->pte_hi & PTE_VALID) == 0) { pvo_pt->pte_hi &= ~PTE_HID; moea_pte_set(pt, pvo_pt); return (i); } } /* * Now try secondary hash. */ ptegidx ^= moea_pteg_mask; for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) { if ((pt->pte_hi & PTE_VALID) == 0) { pvo_pt->pte_hi |= PTE_HID; moea_pte_set(pt, pvo_pt); return (i); } } /* Try again, but this time try to force a PTE out. */ ptegidx = pteg_bkpidx; victim_pvo = moea_pte_spillable_ident(ptegidx); if (victim_pvo == NULL) { ptegidx ^= moea_pteg_mask; victim_pvo = moea_pte_spillable_ident(ptegidx); } if (victim_pvo == NULL) { panic("moea_pte_insert: overflow"); return (-1); } victim_idx = moea_pvo_pte_index(victim_pvo, ptegidx); if (pteg_bkpidx == ptegidx) pvo_pt->pte_hi &= ~PTE_HID; else pvo_pt->pte_hi |= PTE_HID; /* * Synchronize the sacrifice PTE with its PVO, then mark both * invalid. The PVO will be reused when/if the VM system comes * here after a fault. */ pt = &moea_pteg_table[victim_idx >> 3].pt[victim_idx & 7]; if (pt->pte_hi != victim_pvo->pvo_pte.pte.pte_hi) panic("Victim PVO doesn't match PTE! PVO: %8x, PTE: %8x", victim_pvo->pvo_pte.pte.pte_hi, pt->pte_hi); /* * Set the new PTE. */ moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr); PVO_PTEGIDX_CLR(victim_pvo); moea_pte_overflow++; moea_pte_set(pt, pvo_pt); return (victim_idx & 7); } static boolean_t moea_query_bit(vm_page_t m, int ptebit) { struct pvo_entry *pvo; struct pte *pt; rw_assert(&pvh_global_lock, RA_WLOCKED); if (moea_attr_fetch(m) & ptebit) return (TRUE); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { /* * See if we saved the bit off. If so, cache it and return * success. */ if (pvo->pvo_pte.pte.pte_lo & ptebit) { moea_attr_save(m, ptebit); return (TRUE); } } /* * No luck, now go through the hard part of looking at the PTEs * themselves. Sync so that any pending REF/CHG bits are flushed to * the PTEs. */ powerpc_sync(); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, cache it and return success. */ pt = moea_pvo_to_pte(pvo, -1); if (pt != NULL) { moea_pte_synch(pt, &pvo->pvo_pte.pte); mtx_unlock(&moea_table_mutex); if (pvo->pvo_pte.pte.pte_lo & ptebit) { moea_attr_save(m, ptebit); return (TRUE); } } } return (FALSE); } static u_int moea_clear_bit(vm_page_t m, int ptebit) { u_int count; struct pvo_entry *pvo; struct pte *pt; rw_assert(&pvh_global_lock, RA_WLOCKED); /* * Clear the cached value. */ moea_attr_clear(m, ptebit); /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). note that since the pvo entries and * list heads are accessed via BAT0 and are never placed in the page * table, we don't have to worry about further accesses setting the * REF/CHG bits. */ powerpc_sync(); /* * For each pvo entry, clear the pvo's ptebit. If this pvo has a * valid pte clear the ptebit from the valid pte. */ count = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pt = moea_pvo_to_pte(pvo, -1); if (pt != NULL) { moea_pte_synch(pt, &pvo->pvo_pte.pte); if (pvo->pvo_pte.pte.pte_lo & ptebit) { count++; moea_pte_clear(pt, PVO_VADDR(pvo), ptebit); } mtx_unlock(&moea_table_mutex); } pvo->pvo_pte.pte.pte_lo &= ~ptebit; } return (count); } /* * Return true if the physical range is encompassed by the battable[idx] */ static int moea_bat_mapped(int idx, vm_paddr_t pa, vm_size_t size) { u_int prot; u_int32_t start; u_int32_t end; u_int32_t bat_ble; /* * Return immediately if not a valid mapping */ if (!(battable[idx].batu & BAT_Vs)) return (EINVAL); /* * The BAT entry must be cache-inhibited, guarded, and r/w * so it can function as an i/o page */ prot = battable[idx].batl & (BAT_I|BAT_G|BAT_PP_RW); if (prot != (BAT_I|BAT_G|BAT_PP_RW)) return (EPERM); /* * The address should be within the BAT range. Assume that the * start address in the BAT has the correct alignment (thus * not requiring masking) */ start = battable[idx].batl & BAT_PBS; bat_ble = (battable[idx].batu & ~(BAT_EBS)) | 0x03; end = start | (bat_ble << 15) | 0x7fff; if ((pa < start) || ((pa + size) > end)) return (ERANGE); return (0); } boolean_t moea_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { int i; /* * This currently does not work for entries that * overlap 256M BAT segments. */ for(i = 0; i < 16; i++) if (moea_bat_mapped(i, pa, size) == 0) return (0); return (EFAULT); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return (moea_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT)); } void * moea_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; int i; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); /* * If the physical address lies within a valid BAT table entry, * return the 1:1 mapping. This currently doesn't work * for regions that overlap 256M BAT segments. */ for (i = 0; i < 16; i++) { if (moea_bat_mapped(i, pa, size) == 0) return ((void *) pa); } va = kva_alloc(size); if (!va) panic("moea_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea_kenter_attr(mmu, tmpva, ppa, ma); tlbie(tmpva); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void moea_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; /* * If this is outside kernel virtual space, then it's a * battable entry and doesn't require unmapping */ if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= virtual_end)) { base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); kva_free(base, size); } } static void moea_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va); len = MIN(lim - va, sz); pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL); if (pvo != NULL) { pa = (pvo->pvo_pte.pte.pte_lo & PTE_RPGN) | (va & ADDR_POFF); moea_syncicache(pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { *va = (void *)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea_scan_init(mmu_t mmu) { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL); if (pvo == NULL || !(pvo->pvo_pte.pte.pte_hi & PTE_VALID)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } Index: head/sys/powerpc/aim/mmu_oea64.c =================================================================== --- head/sys/powerpc/aim/mmu_oea64.c (revision 352406) +++ head/sys/powerpc/aim/mmu_oea64.c (revision 352407) @@ -1,2936 +1,2934 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008-2015 Nathan Whitehorn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_oea64.h" #include "mmu_if.h" #include "moea64_if.h" void moea64_release_vsid(uint64_t vsid); uintptr_t moea64_get_unique_vsid(void); #define DISABLE_TRANS(msr) msr = mfmsr(); mtmsr(msr & ~PSL_DR) #define ENABLE_TRANS(msr) mtmsr(msr) #define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4)) #define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff) #define VSID_HASH_MASK 0x0000007fffffffffULL /* * Locking semantics: * * There are two locks of interest: the page locks and the pmap locks, which * protect their individual PVO lists and are locked in that order. The contents * of all PVO entries are protected by the locks of their respective pmaps. * The pmap of any PVO is guaranteed not to change so long as the PVO is linked * into any list. * */ #define PV_LOCK_PER_DOM PA_LOCK_COUNT*3 #define PV_LOCK_COUNT PV_LOCK_PER_DOM*MAXMEMDOM static struct mtx_padalign pv_lock[PV_LOCK_COUNT]; /* * Cheap NUMA-izing of the pv locks, to reduce contention across domains. * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the * index at (N << 45). */ #ifdef __powerpc64__ #define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_PER_DOM + \ (((pa) >> 45) % MAXMEMDOM) * PV_LOCK_PER_DOM) #else #define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_COUNT) #endif #define PV_LOCKPTR(pa) ((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)])) #define PV_LOCK(pa) mtx_lock(PV_LOCKPTR(pa)) #define PV_UNLOCK(pa) mtx_unlock(PV_LOCKPTR(pa)) #define PV_LOCKASSERT(pa) mtx_assert(PV_LOCKPTR(pa), MA_OWNED) #define PV_PAGE_LOCK(m) PV_LOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_UNLOCK(m) PV_UNLOCK(VM_PAGE_TO_PHYS(m)) #define PV_PAGE_LOCKASSERT(m) PV_LOCKASSERT(VM_PAGE_TO_PHYS(m)) struct ofw_map { cell_t om_va; cell_t om_len; uint64_t om_pa; cell_t om_mode; }; extern unsigned char _etext[]; extern unsigned char _end[]; extern void *slbtrap, *slbtrapend; /* * Map of physical memory regions. */ static struct mem_region *regions; static struct mem_region *pregions; static struct numa_mem_region *numa_pregions; static u_int phys_avail_count; static int regions_sz, pregions_sz, numapregions_sz; extern void bs_remap_earlyboot(void); /* * Lock for the SLB tables. */ struct mtx moea64_slb_mutex; /* * PTEG data. */ u_long moea64_pteg_count; u_long moea64_pteg_mask; /* * PVO data. */ uma_zone_t moea64_pvo_zone; /* zone for pvo entries */ static struct pvo_entry *moea64_bpvo_pool; static int moea64_bpvo_pool_index = 0; static int moea64_bpvo_pool_size = 327680; TUNABLE_INT("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size); SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD, &moea64_bpvo_pool_index, 0, ""); #define VSID_NBPW (sizeof(u_int32_t) * 8) #ifdef __powerpc64__ #define NVSIDS (NPMAPS * 16) #define VSID_HASHMASK 0xffffffffUL #else #define NVSIDS NPMAPS #define VSID_HASHMASK 0xfffffUL #endif static u_int moea64_vsid_bitmap[NVSIDS / VSID_NBPW]; static boolean_t moea64_initialized = FALSE; #ifdef MOEA64_STATS /* * Statistics. */ u_int moea64_pte_valid = 0; u_int moea64_pte_overflow = 0; u_int moea64_pvo_entries = 0; u_int moea64_pvo_enter_calls = 0; u_int moea64_pvo_remove_calls = 0; SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD, &moea64_pte_valid, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD, &moea64_pte_overflow, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD, &moea64_pvo_entries, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD, &moea64_pvo_enter_calls, 0, ""); SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD, &moea64_pvo_remove_calls, 0, ""); #endif vm_offset_t moea64_scratchpage_va[2]; struct pvo_entry *moea64_scratchpage_pvo[2]; struct mtx moea64_scratchpage_mtx; uint64_t moea64_large_page_mask = 0; uint64_t moea64_large_page_size = 0; int moea64_large_page_shift = 0; /* * PVO calls. */ static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head, struct pvo_entry **oldpvo); static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo); static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo); static void moea64_pvo_remove_from_page_locked(mmu_t mmu, struct pvo_entry *pvo, vm_page_t m); static struct pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t); /* * Utility routines. */ static boolean_t moea64_query_bit(mmu_t, vm_page_t, uint64_t); static u_int moea64_clear_bit(mmu_t, vm_page_t, uint64_t); static void moea64_kremove(mmu_t, vm_offset_t); static void moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz); static void moea64_pmap_init_qpages(void); /* * Kernel MMU interface */ void moea64_clear_modify(mmu_t, vm_page_t); void moea64_copy_page(mmu_t, vm_page_t, vm_page_t); void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize); int moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t); vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); void moea64_init(mmu_t); boolean_t moea64_is_modified(mmu_t, vm_page_t); boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t); boolean_t moea64_is_referenced(mmu_t, vm_page_t); int moea64_ts_referenced(mmu_t, vm_page_t); vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t); void moea64_page_init(mmu_t, vm_page_t); int moea64_page_wired_mappings(mmu_t, vm_page_t); void moea64_pinit(mmu_t, pmap_t); void moea64_pinit0(mmu_t, pmap_t); void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int); void moea64_qremove(mmu_t, vm_offset_t, int); void moea64_release(mmu_t, pmap_t); void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_remove_pages(mmu_t, pmap_t); void moea64_remove_all(mmu_t, vm_page_t); void moea64_remove_write(mmu_t, vm_page_t); void moea64_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); void moea64_zero_page(mmu_t, vm_page_t); void moea64_zero_page_area(mmu_t, vm_page_t, int, int); void moea64_activate(mmu_t, struct thread *); void moea64_deactivate(mmu_t, struct thread *); void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t); void *moea64_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t); vm_paddr_t moea64_kextract(mmu_t, vm_offset_t); void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma); void moea64_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t ma); void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t); boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va); void moea64_scan_init(mmu_t mmu); vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m); void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr); static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static mmu_method_t moea64_methods[] = { MMUMETHOD(mmu_clear_modify, moea64_clear_modify), MMUMETHOD(mmu_copy_page, moea64_copy_page), MMUMETHOD(mmu_copy_pages, moea64_copy_pages), MMUMETHOD(mmu_enter, moea64_enter), MMUMETHOD(mmu_enter_object, moea64_enter_object), MMUMETHOD(mmu_enter_quick, moea64_enter_quick), MMUMETHOD(mmu_extract, moea64_extract), MMUMETHOD(mmu_extract_and_hold, moea64_extract_and_hold), MMUMETHOD(mmu_init, moea64_init), MMUMETHOD(mmu_is_modified, moea64_is_modified), MMUMETHOD(mmu_is_prefaultable, moea64_is_prefaultable), MMUMETHOD(mmu_is_referenced, moea64_is_referenced), MMUMETHOD(mmu_ts_referenced, moea64_ts_referenced), MMUMETHOD(mmu_map, moea64_map), MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick), MMUMETHOD(mmu_page_init, moea64_page_init), MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings), MMUMETHOD(mmu_pinit, moea64_pinit), MMUMETHOD(mmu_pinit0, moea64_pinit0), MMUMETHOD(mmu_protect, moea64_protect), MMUMETHOD(mmu_qenter, moea64_qenter), MMUMETHOD(mmu_qremove, moea64_qremove), MMUMETHOD(mmu_release, moea64_release), MMUMETHOD(mmu_remove, moea64_remove), MMUMETHOD(mmu_remove_pages, moea64_remove_pages), MMUMETHOD(mmu_remove_all, moea64_remove_all), MMUMETHOD(mmu_remove_write, moea64_remove_write), MMUMETHOD(mmu_sync_icache, moea64_sync_icache), MMUMETHOD(mmu_unwire, moea64_unwire), MMUMETHOD(mmu_zero_page, moea64_zero_page), MMUMETHOD(mmu_zero_page_area, moea64_zero_page_area), MMUMETHOD(mmu_activate, moea64_activate), MMUMETHOD(mmu_deactivate, moea64_deactivate), MMUMETHOD(mmu_page_set_memattr, moea64_page_set_memattr), MMUMETHOD(mmu_quick_enter_page, moea64_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, moea64_quick_remove_page), /* Internal interfaces */ MMUMETHOD(mmu_mapdev, moea64_mapdev), MMUMETHOD(mmu_mapdev_attr, moea64_mapdev_attr), MMUMETHOD(mmu_unmapdev, moea64_unmapdev), MMUMETHOD(mmu_kextract, moea64_kextract), MMUMETHOD(mmu_kenter, moea64_kenter), MMUMETHOD(mmu_kenter_attr, moea64_kenter_attr), MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped), MMUMETHOD(mmu_scan_init, moea64_scan_init), MMUMETHOD(mmu_dumpsys_map, moea64_dumpsys_map), MMUMETHOD(mmu_map_user_ptr, moea64_map_user_ptr), MMUMETHOD(mmu_decode_kernel_ptr, moea64_decode_kernel_ptr), { 0, 0 } }; MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0); static struct pvo_head * vm_page_to_pvoh(vm_page_t m) { mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED); return (&m->md.mdpg_pvoh); } static struct pvo_entry * alloc_pvo_entry(int bootstrap) { struct pvo_entry *pvo; if (!moea64_initialized || bootstrap) { if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) { panic("moea64_enter: bpvo pool exhausted, %d, %d, %zd", moea64_bpvo_pool_index, moea64_bpvo_pool_size, moea64_bpvo_pool_size * sizeof(struct pvo_entry)); } pvo = &moea64_bpvo_pool[ atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)]; bzero(pvo, sizeof(*pvo)); pvo->pvo_vaddr = PVO_BOOTSTRAP; } else pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO); return (pvo); } static void init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va) { uint64_t vsid; uint64_t hash; int shift; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pvo->pvo_pmap = pmap; va &= ~ADDR_POFF; pvo->pvo_vaddr |= va; vsid = va_to_vsid(pmap, va); pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT) | (vsid << 16); shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift : ADDR_PIDX_SHFT; hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift); pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3; } static void free_pvo_entry(struct pvo_entry *pvo) { if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP)) uma_zfree(moea64_pvo_zone, pvo); } void moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte) { lpte->pte_hi = (pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) & LPTE_AVPN_MASK; lpte->pte_hi |= LPTE_VALID; if (pvo->pvo_vaddr & PVO_LARGE) lpte->pte_hi |= LPTE_BIG; if (pvo->pvo_vaddr & PVO_WIRED) lpte->pte_hi |= LPTE_WIRED; if (pvo->pvo_vaddr & PVO_HID) lpte->pte_hi |= LPTE_HID; lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) lpte->pte_lo |= LPTE_BW; else lpte->pte_lo |= LPTE_BR; if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE)) lpte->pte_lo |= LPTE_NOEXEC; } static __inline uint64_t moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint64_t pte_lo; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (LPTE_I | LPTE_G); case VM_MEMATTR_CACHEABLE: return (LPTE_M); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (LPTE_I); case VM_MEMATTR_WRITE_THROUGH: return (LPTE_W | LPTE_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ pte_lo = LPTE_I | LPTE_G; for (i = 0; i < pregions_sz; i++) { if ((pa >= pregions[i].mr_start) && (pa < (pregions[i].mr_start + pregions[i].mr_size))) { pte_lo &= ~(LPTE_I | LPTE_G); pte_lo |= LPTE_M; break; } } return pte_lo; } /* * Quick sort callout for comparing memory regions. */ static int om_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b) { const struct ofw_map *mapa; const struct ofw_map *mapb; mapa = a; mapb = b; if (mapa->om_pa < mapb->om_pa) return (-1); else if (mapa->om_pa > mapb->om_pa) return (1); else return (0); } static void moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz) { struct ofw_map translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */ pcell_t acells, trans_cells[sz/sizeof(cell_t)]; struct pvo_entry *pvo; register_t msr; vm_offset_t off; vm_paddr_t pa_base; int i, j; bzero(translations, sz); OF_getencprop(OF_finddevice("/"), "#address-cells", &acells, sizeof(acells)); if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1) panic("moea64_bootstrap: can't get ofw translations"); CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations"); sz /= sizeof(cell_t); for (i = 0, j = 0; i < sz; j++) { translations[j].om_va = trans_cells[i++]; translations[j].om_len = trans_cells[i++]; translations[j].om_pa = trans_cells[i++]; if (acells == 2) { translations[j].om_pa <<= 32; translations[j].om_pa |= trans_cells[i++]; } translations[j].om_mode = trans_cells[i++]; } KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)", i, sz)); sz = j; qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { pa_base = translations[i].om_pa; #ifndef __powerpc64__ if ((translations[i].om_pa >> 32) != 0) panic("OFW translations above 32-bit boundary!"); #endif if (pa_base % PAGE_SIZE) panic("OFW translation not page-aligned (phys)!"); if (translations[i].om_va % PAGE_SIZE) panic("OFW translation not page-aligned (virt)!"); CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x", pa_base, translations[i].om_va, translations[i].om_len); /* Now enter the pages for this mapping */ DISABLE_TRANS(msr); for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) { /* If this address is direct-mapped, skip remapping */ if (hw_direct_map && translations[i].om_va == PHYS_TO_DMAP(pa_base) && moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT) == LPTE_M) continue; PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, translations[i].om_va + off); PMAP_UNLOCK(kernel_pmap); if (pvo != NULL) continue; moea64_kenter(mmup, translations[i].om_va + off, pa_base + off); } ENABLE_TRANS(msr); } } #ifdef __powerpc64__ static void moea64_probe_large_page(void) { uint16_t pvr = mfpvr() >> 16; switch (pvr) { case IBM970: case IBM970FX: case IBM970MP: powerpc_sync(); isync(); mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG); powerpc_sync(); isync(); /* FALLTHROUGH */ default: if (moea64_large_page_size == 0) { moea64_large_page_size = 0x1000000; /* 16 MB */ moea64_large_page_shift = 24; } } moea64_large_page_mask = moea64_large_page_size - 1; } static void moea64_bootstrap_slb_prefault(vm_offset_t va, int large) { struct slb *cache; struct slb entry; uint64_t esid, slbe; uint64_t i; cache = PCPU_GET(aim.slb); esid = va >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; for (i = 0; i < 64; i++) { if (cache[i].slbe == (slbe | i)) return; } entry.slbe = slbe; entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT; if (large) entry.slbv |= SLBV_L; slb_insert_kernel(entry.slbe, entry.slbv); } #endif static void moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { struct pvo_entry *pvo; register_t msr; vm_paddr_t pa, pkernelstart, pkernelend; vm_offset_t size, off; uint64_t pte_lo; int i; if (moea64_large_page_size == 0) hw_direct_map = 0; DISABLE_TRANS(msr); if (hw_direct_map) { PMAP_LOCK(kernel_pmap); for (i = 0; i < pregions_sz; i++) { for (pa = pregions[i].mr_start; pa < pregions[i].mr_start + pregions[i].mr_size; pa += moea64_large_page_size) { pte_lo = LPTE_M; pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE; init_pvo_entry(pvo, kernel_pmap, PHYS_TO_DMAP(pa)); /* * Set memory access as guarded if prefetch within * the page could exit the available physmem area. */ if (pa & moea64_large_page_mask) { pa &= moea64_large_page_mask; pte_lo |= LPTE_G; } if (pa + moea64_large_page_size > pregions[i].mr_start + pregions[i].mr_size) pte_lo |= LPTE_G; pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = pa | pte_lo; moea64_pvo_enter(mmup, pvo, NULL, NULL); } } PMAP_UNLOCK(kernel_pmap); } /* * Make sure the kernel and BPVO pool stay mapped on systems either * without a direct map or on which the kernel is not already executing * out of the direct-mapped region. */ if (kernelstart < DMAP_BASE_ADDRESS) { /* * For pre-dmap execution, we need to use identity mapping * because we will be operating with the mmu on but in the * wrong address configuration until we __restartkernel(). */ for (pa = kernelstart & ~PAGE_MASK; pa < kernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); } else if (!hw_direct_map) { pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS; pkernelend = kernelend & ~DMAP_BASE_ADDRESS; for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend; pa += PAGE_SIZE) moea64_kenter(mmup, pa | DMAP_BASE_ADDRESS, pa); } if (!hw_direct_map) { size = moea64_bpvo_pool_size*sizeof(struct pvo_entry); off = (vm_offset_t)(moea64_bpvo_pool); for (pa = off; pa < off + size; pa += PAGE_SIZE) moea64_kenter(mmup, pa, pa); /* Map exception vectors */ for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE) moea64_kenter(mmup, pa | DMAP_BASE_ADDRESS, pa); } ENABLE_TRANS(msr); /* * Allow user to override unmapped_buf_allowed for testing. * XXXKIB Only direct map implementation was tested. */ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed)) unmapped_buf_allowed = hw_direct_map; } /* Quick sort callout for comparing physical addresses. */ static int pa_cmp(const void *a, const void *b) { const vm_paddr_t *pa = a, *pb = b; if (*pa < *pb) return (-1); else if (*pa > *pb) return (1); else return (0); } void moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i, j; vm_size_t physsz, hwphyssz; vm_paddr_t kernelphysstart, kernelphysend; int rm_pavail; #ifndef __powerpc64__ /* We don't have a direct map since there is no BAT */ hw_direct_map = 0; /* Make sure battable is zero, since we have no BAT */ for (i = 0; i < 16; i++) { battable[i].batu = 0; battable[i].batl = 0; } #else moea64_probe_large_page(); /* Use a direct map if we have large page support */ if (moea64_large_page_size > 0) hw_direct_map = 1; else hw_direct_map = 0; /* Install trap handlers for SLBs */ bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap); bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap); __syncicache((void *)EXC_DSE, 0x80); __syncicache((void *)EXC_ISE, 0x80); #endif kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS; kernelphysend = kernelend & ~DMAP_BASE_ADDRESS; /* Get physical memory regions from firmware */ mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); CTR0(KTR_PMAP, "moea64_bootstrap: physical memory"); if (PHYS_AVAIL_ENTRIES < regions_sz) panic("moea64_bootstrap: phys_avail too small"); phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); for (i = 0, j = 0; i < regions_sz; i++, j += 2) { CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)", regions[i].mr_start, regions[i].mr_start + regions[i].mr_size, regions[i].mr_size); if (hwphyssz != 0 && (physsz + regions[i].mr_size) >= hwphyssz) { if (physsz < hwphyssz) { phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = regions[i].mr_start; phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size; phys_avail_count++; physsz += regions[i].mr_size; } /* Check for overlap with the kernel and exception vectors */ rm_pavail = 0; for (j = 0; j < 2*phys_avail_count; j+=2) { if (phys_avail[j] < EXC_LAST) phys_avail[j] += EXC_LAST; if (phys_avail[j] >= kernelphysstart && phys_avail[j+1] <= kernelphysend) { phys_avail[j] = phys_avail[j+1] = ~0; rm_pavail++; continue; } if (kernelphysstart >= phys_avail[j] && kernelphysstart < phys_avail[j+1]) { if (kernelphysend < phys_avail[j+1]) { phys_avail[2*phys_avail_count] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; phys_avail[2*phys_avail_count + 1] = phys_avail[j+1]; phys_avail_count++; } phys_avail[j+1] = kernelphysstart & ~PAGE_MASK; } if (kernelphysend >= phys_avail[j] && kernelphysend < phys_avail[j+1]) { if (kernelphysstart > phys_avail[j]) { phys_avail[2*phys_avail_count] = phys_avail[j]; phys_avail[2*phys_avail_count + 1] = kernelphysstart & ~PAGE_MASK; phys_avail_count++; } phys_avail[j] = (kernelphysend & ~PAGE_MASK) + PAGE_SIZE; } } /* Remove physical available regions marked for removal (~0) */ if (rm_pavail) { qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]), pa_cmp); phys_avail_count -= rm_pavail; for (i = 2*phys_avail_count; i < 2*(phys_avail_count + rm_pavail); i+=2) phys_avail[i] = phys_avail[i+1] = 0; } physmem = btoc(physsz); #ifdef PTEGCOUNT moea64_pteg_count = PTEGCOUNT; #else moea64_pteg_count = 0x1000; while (moea64_pteg_count < physmem) moea64_pteg_count <<= 1; moea64_pteg_count >>= 1; #endif /* PTEGCOUNT */ } void moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { int i; /* * Set PTEG mask */ moea64_pteg_mask = moea64_pteg_count - 1; /* * Initialize SLB table lock and page locks */ mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF); for (i = 0; i < PV_LOCK_COUNT; i++) mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF); /* * Initialise the bootstrap pvo pool. */ moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc( moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE); moea64_bpvo_pool_index = 0; /* Place at address usable through the direct map */ if (hw_direct_map) moea64_bpvo_pool = (struct pvo_entry *) PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool); /* * Make sure kernel vsid is allocated as well as VSID 0. */ #ifndef __powerpc64__ moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW] |= 1 << (KERNEL_VSIDBITS % VSID_NBPW); moea64_vsid_bitmap[0] |= 1; #endif /* * Initialize the kernel pmap (which is statically allocated). */ #ifdef __powerpc64__ for (i = 0; i < 64; i++) { pcpup->pc_aim.slb[i].slbv = 0; pcpup->pc_aim.slb[i].slbe = 0; } #else for (i = 0; i < 16; i++) kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i; #endif kernel_pmap->pmap_phys = kernel_pmap; CPU_FILL(&kernel_pmap->pm_active); RB_INIT(&kernel_pmap->pmap_pvo); PMAP_LOCK_INIT(kernel_pmap); /* * Now map in all the other buffers we allocated earlier */ moea64_setup_direct_map(mmup, kernelstart, kernelend); } void moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend) { ihandle_t mmui; phandle_t chosen; phandle_t mmu; ssize_t sz; int i; vm_offset_t pa, va; void *dpcpu; /* * Set up the Open Firmware pmap and add its mappings if not in real * mode. */ chosen = OF_finddevice("/chosen"); if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) { mmu = OF_instance_to_package(mmui); if (mmu == -1 || (sz = OF_getproplen(mmu, "translations")) == -1) sz = 0; if (sz > 6144 /* tmpstksz - 2 KB headroom */) panic("moea64_bootstrap: too many ofw translations"); if (sz > 0) moea64_add_ofw_mappings(mmup, mmu, sz); } /* * Calculate the last available physical address. */ Maxmem = 0; for (i = 0; phys_avail[i + 2] != 0; i += 2) Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1])); /* * Initialize MMU. */ MMU_CPU_BOOTSTRAP(mmup,0); mtmsr(mfmsr() | PSL_DR | PSL_IR); pmap_bootstrapped++; /* * Set the start and end of kva. */ virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS; /* * Map the entire KVA range into the SLB. We must not fault there. */ #ifdef __powerpc64__ for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH) moea64_bootstrap_slb_prefault(va, 0); #endif /* * Remap any early IO mappings (console framebuffer, etc.) */ bs_remap_earlyboot(); /* * Figure out how far we can extend virtual_end into segment 16 * without running into existing mappings. Segment 16 is guaranteed * to contain neither RAM nor devices (at least on Apple hardware), * but will generally contain some OFW mappings we should not * step on. */ #ifndef __powerpc64__ /* KVA is in high memory on PPC64 */ PMAP_LOCK(kernel_pmap); while (virtual_end < VM_MAX_KERNEL_ADDRESS && moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL) virtual_end += PAGE_SIZE; PMAP_UNLOCK(kernel_pmap); #endif /* * Allocate a kernel stack with a guard page for thread0 and map it * into the kernel page map. */ pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE); va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; virtual_avail = va + kstack_pages * PAGE_SIZE; CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va); thread0.td_kstack = va; thread0.td_kstack_pages = kstack_pages; for (i = 0; i < kstack_pages; i++) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the message buffer. */ pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE); msgbufp = (struct msgbuf *)virtual_avail; va = virtual_avail; virtual_avail += round_page(msgbufsize); while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } /* * Allocate virtual address space for the dynamic percpu area. */ pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE); dpcpu = (void *)virtual_avail; va = virtual_avail; virtual_avail += DPCPU_SIZE; while (va < virtual_avail) { moea64_kenter(mmup, va, pa); pa += PAGE_SIZE; va += PAGE_SIZE; } dpcpu_init(dpcpu, curcpu); /* * Allocate some things for page zeroing. We put this directly * in the page table and use MOEA64_PTE_REPLACE to avoid any * of the PVO book-keeping or other parts of the VM system * from even knowing that this hack exists. */ if (!hw_direct_map) { mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL, MTX_DEF); for (i = 0; i < 2; i++) { moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE; virtual_end -= PAGE_SIZE; moea64_kenter(mmup, moea64_scratchpage_va[i], 0); PMAP_LOCK(kernel_pmap); moea64_scratchpage_pvo[i] = moea64_pvo_find_va( kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]); PMAP_UNLOCK(kernel_pmap); } } numa_mem_regions(&numa_pregions, &numapregions_sz); } static void moea64_pmap_init_qpages(void) { struct pcpu *pc; int i; if (hw_direct_map) return; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); PMAP_LOCK(kernel_pmap); pc->pc_aim.qmap_pvo = moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr); PMAP_UNLOCK(kernel_pmap); mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL); /* * Activate a user pmap. This mostly involves setting some non-CPU * state. */ void moea64_activate(mmu_t mmu, struct thread *td) { pmap_t pm; pm = &td->td_proc->p_vmspace->vm_pmap; CPU_SET(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, pm->pm_slb); __asm __volatile("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #else PCPU_SET(curpmap, pm->pmap_phys); mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid); #endif } void moea64_deactivate(mmu_t mmu, struct thread *td) { pmap_t pm; __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR)); pm = &td->td_proc->p_vmspace->vm_pmap; CPU_CLR(PCPU_GET(cpuid), &pm->pm_active); #ifdef __powerpc64__ PCPU_SET(aim.userslb, NULL); #else PCPU_SET(curpmap, NULL); #endif } void moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry key, *pvo; vm_page_t m; int64_t refchg; key.pvo_vaddr = sva; PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) { if ((pvo->pvo_vaddr & PVO_WIRED) == 0) panic("moea64_unwire: pvo %p is missing PVO_WIRED", pvo); pvo->pvo_vaddr &= ~PVO_WIRED; refchg = MOEA64_PTE_REPLACE(mmu, pvo, 0 /* No invalidation */); if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { if (refchg < 0) refchg = LPTE_CHG; m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } pm->pm_stats.wired_count--; } PMAP_UNLOCK(pm); } /* * This goes through and sets the physical address of our * special scratch PTE to the PA we want to zero or copy. Because * of locking issues (this can get called in pvo_enter() by * the UMA allocator), we can't use most other utility functions here */ static __inline void moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_paddr_t pa) { struct pvo_entry *pvo; KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!")); mtx_assert(&moea64_scratchpage_mtx, MA_OWNED); pvo = moea64_scratchpage_pvo[which]; PMAP_LOCK(pvo->pvo_pmap); pvo->pvo_pte.pa = moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmup, pvo, MOEA64_PTE_INVALIDATE); PMAP_UNLOCK(pvo->pvo_pmap); isync(); } void moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst) { vm_offset_t dst; vm_offset_t src; dst = VM_PAGE_TO_PHYS(mdst); src = VM_PAGE_TO_PHYS(msrc); if (hw_direct_map) { bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst), PAGE_SIZE); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, src); moea64_set_scratchpage_pa(mmu, 1, dst); bcopy((void *)moea64_scratchpage_va[0], (void *)moea64_scratchpage_va[1], PAGE_SIZE); mtx_unlock(&moea64_scratchpage_mtx); } } static inline void moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); a_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); b_cp = (char *)(uintptr_t)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } static inline void moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; mtx_lock(&moea64_scratchpage_mtx); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); moea64_set_scratchpage_pa(mmu, 0, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); moea64_set_scratchpage_pa(mmu, 1, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(&moea64_scratchpage_mtx); } void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { if (hw_direct_map) { moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset, xfersize); } else { moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset, xfersize); } } void moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (size + off > PAGE_SIZE) panic("moea64_zero_page: size + off > PAGE_SIZE"); if (hw_direct_map) { bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size); } else { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); bzero((caddr_t)moea64_scratchpage_va[0] + off, size); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Zero a page of physical memory by temporarily mapping it */ void moea64_zero_page(mmu_t mmu, vm_page_t m) { vm_paddr_t pa = VM_PAGE_TO_PHYS(m); vm_offset_t va, off; if (!hw_direct_map) { mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 0, pa); va = moea64_scratchpage_va[0]; } else { va = PHYS_TO_DMAP(pa); } for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); if (!hw_direct_map) mtx_unlock(&moea64_scratchpage_mtx); } vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; vm_paddr_t pa = VM_PAGE_TO_PHYS(m); if (hw_direct_map) return (PHYS_TO_DMAP(pa)); /* * MOEA64_PTE_REPLACE does some locking, so we can't just grab * a critical section and access the PCPU data like on i386. * Instead, pin the thread and grab the PCPU lock to prevent * a preempting thread from using the same PCPU data. */ sched_pin(); mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED); pvo = PCPU_GET(aim.qmap_pvo); mtx_lock(PCPU_PTR(aim.qmap_lock)); pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) | (uint64_t)pa; MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); isync(); return (PCPU_GET(qmap_addr)); } void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr) { if (hw_direct_map) return; mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED); KASSERT(PCPU_GET(qmap_addr) == addr, ("moea64_quick_remove_page: invalid address")); mtx_unlock(PCPU_PTR(aim.qmap_lock)); sched_unpin(); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct pvo_entry *pvo, *oldpvo; struct pvo_head *pvo_head; uint64_t pte_lo; int error; if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pvo = alloc_pvo_entry(0); if (pvo == NULL) return (KERN_RESOURCE_SHORTAGE); pvo->pvo_pmap = NULL; /* to be filled in later */ pvo->pvo_pte.prot = prot; pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m)); pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo; if ((flags & PMAP_ENTER_WIRED) != 0) pvo->pvo_vaddr |= PVO_WIRED; if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) { pvo_head = NULL; } else { pvo_head = &m->md.mdpg_pvoh; pvo->pvo_vaddr |= PVO_MANAGED; } PV_PAGE_LOCK(m); PMAP_LOCK(pmap); if (pvo->pvo_pmap == NULL) init_pvo_entry(pvo, pmap, va); if (prot & VM_PROT_WRITE) if (pmap_bootstrapped && (m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); error = moea64_pvo_enter(mmu, pvo, pvo_head, &oldpvo); if (error == EEXIST) { if (oldpvo->pvo_vaddr == pvo->pvo_vaddr && oldpvo->pvo_pte.pa == pvo->pvo_pte.pa && oldpvo->pvo_pte.prot == prot) { /* Identical mapping already exists */ error = 0; /* If not in page table, reinsert it */ if (MOEA64_PTE_SYNCH(mmu, oldpvo) < 0) { STAT_MOEA64(moea64_pte_overflow--); MOEA64_PTE_INSERT(mmu, oldpvo); } /* Then just clean up and go home */ PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); free_pvo_entry(pvo); goto out; } else { /* Otherwise, need to kill it first */ KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old " "mapping does not match new mapping")); moea64_pvo_remove_from_pmap(mmu, oldpvo); moea64_pvo_enter(mmu, pvo, pvo_head, NULL); } } PV_PAGE_UNLOCK(m); PMAP_UNLOCK(pmap); /* Free any dead pages */ if (error == EEXIST) { moea64_pvo_remove_from_page(mmu, oldpvo); free_pvo_entry(oldpvo); } out: /* * Flush the page from the instruction cache if this page is * mapped executable and cacheable. */ - if (pmap != kernel_pmap && (vm_page_aflags(m) & PGA_EXECUTABLE) != 0 && + if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) && (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { vm_page_aflag_set(m, PGA_EXECUTABLE); moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE); } return (KERN_SUCCESS); } static void moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_paddr_t pa, vm_size_t sz) { /* * This is much trickier than on older systems because * we can't sync the icache on physical addresses directly * without a direct map. Instead we check a couple of cases * where the memory is already mapped in and, failing that, * use the same trick we use for page zeroing to create * a temporary mapping for this physical address. */ if (!pmap_bootstrapped) { /* * If PMAP is not bootstrapped, we are likely to be * in real mode. */ __syncicache((void *)(uintptr_t)pa, sz); } else if (pmap == kernel_pmap) { __syncicache((void *)va, sz); } else if (hw_direct_map) { __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz); } else { /* Use the scratch page to set up a temp mapping */ mtx_lock(&moea64_scratchpage_mtx); moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF); __syncicache((void *)(moea64_scratchpage_va[1] + (va & ADDR_POFF)), sz); mtx_unlock(&moea64_scratchpage_mtx); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { moea64_enter(mmu, pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); m = TAILQ_NEXT(m, listq); } } void moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { moea64_enter(mmu, pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); } vm_paddr_t moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; PMAP_LOCK(pm); pvo = moea64_pvo_find_va(pm, va); if (pvo == NULL) pa = 0; else pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct pvo_entry *pvo; vm_page_t m; m = NULL; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) { m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if (!vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } static mmu_t installed_mmu; static void * moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { struct pvo_entry *pvo; vm_offset_t va; vm_page_t m; int needed_lock; /* * This entire routine is a horrible hack to avoid bothering kmem * for new KVA addresses. Because this can get called from inside * kmem allocation routines, calling kmem for a new address here * can lead to multiply locking non-recursive mutexes. */ *flags = UMA_SLAB_PRIV; needed_lock = !PMAP_LOCKED(kernel_pmap); m = vm_page_alloc_domain(NULL, 0, domain, malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ); if (m == NULL) return (NULL); va = VM_PAGE_TO_PHYS(m); pvo = alloc_pvo_entry(1 /* bootstrap */); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE; pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M; if (needed_lock) PMAP_LOCK(kernel_pmap); init_pvo_entry(pvo, kernel_pmap, va); pvo->pvo_vaddr |= PVO_WIRED; moea64_pvo_enter(installed_mmu, pvo, NULL, NULL); if (needed_lock) PMAP_UNLOCK(kernel_pmap); if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0) bzero((void *)va, PAGE_SIZE); return (void *)va; } extern int elf32_nxstack; void moea64_init(mmu_t mmu) { CTR0(KTR_PMAP, "moea64_init"); moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); if (!hw_direct_map) { installed_mmu = mmu; uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc); } #ifdef COMPAT_FREEBSD32 elf32_nxstack = 1; #endif moea64_initialized = TRUE; } boolean_t moea64_is_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_referenced: page %p is not managed", m)); return (moea64_query_bit(mmu, m, LPTE_REF)); } boolean_t moea64_is_modified(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have LPTE_CHG set. */ VM_OBJECT_ASSERT_LOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (moea64_query_bit(mmu, m, LPTE_CHG)); } boolean_t moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { struct pvo_entry *pvo; boolean_t rv = TRUE; PMAP_LOCK(pmap); pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF); if (pvo != NULL) rv = FALSE; PMAP_UNLOCK(pmap); return (rv); } void moea64_clear_modify(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("moea64_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have LPTE_CHG * set. If the object containing the page is locked and the page is * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; moea64_clear_bit(mmu, m, LPTE_CHG); } /* * Clear the write and modified bits in each of the given page's mappings. */ void moea64_remove_write(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int64_t refchg, ret; pmap_t pmap; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; powerpc_sync(); PV_PAGE_LOCK(m); refchg = 0; LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pvo->pvo_pte.prot &= ~VM_PROT_WRITE; ret = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (ret < 0) ret = LPTE_CHG; refchg |= ret; if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG) vm_page_dirty(m); vm_page_aflag_clear(m, PGA_WRITEABLE); PV_PAGE_UNLOCK(m); } /* * moea64_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int moea64_ts_referenced(mmu_t mmu, vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_ts_referenced: page %p is not managed", m)); return (moea64_clear_bit(mmu, m, LPTE_REF)); } /* * Modify the WIMG settings of all mappings for a page. */ void moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { struct pvo_entry *pvo; int64_t refchg; pmap_t pmap; uint64_t lo; if ((m->oflags & VPO_UNMANAGED) != 0) { m->md.mdpg_cache_attrs = ma; return; } lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) { pvo->pvo_pte.pa &= ~LPTE_WIMG; pvo->pvo_pte.pa |= lo; refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE); if (refchg < 0) refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ? LPTE_CHG : 0; if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&m->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(m); if (refchg & LPTE_REF) vm_page_aflag_set(m, PGA_REFERENCED); } if (pvo->pvo_pmap == kernel_pmap) isync(); } PMAP_UNLOCK(pmap); } m->md.mdpg_cache_attrs = ma; PV_PAGE_UNLOCK(m); } /* * Map a wired page into kernel virtual address space. */ void moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { int error; struct pvo_entry *pvo, *oldpvo; do { pvo = alloc_pvo_entry(0); if (pvo == NULL) vm_wait(NULL); } while (pvo == NULL); pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma); pvo->pvo_vaddr |= PVO_WIRED; PMAP_LOCK(kernel_pmap); oldpvo = moea64_pvo_find_va(kernel_pmap, va); if (oldpvo != NULL) moea64_pvo_remove_from_pmap(mmu, oldpvo); init_pvo_entry(pvo, kernel_pmap, va); error = moea64_pvo_enter(mmu, pvo, NULL, NULL); PMAP_UNLOCK(kernel_pmap); /* Free any dead pages */ if (oldpvo != NULL) { moea64_pvo_remove_from_page(mmu, oldpvo); free_pvo_entry(oldpvo); } if (error != 0 && error != ENOENT) panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va, (uintmax_t)pa, error); } void moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t moea64_kextract(mmu_t mmu, vm_offset_t va) { struct pvo_entry *pvo; vm_paddr_t pa; /* * Shortcut the direct-mapped case when applicable. We never put * anything but 1:1 (or 62-bit aliased) mappings below * VM_MIN_KERNEL_ADDRESS. */ if (va < VM_MIN_KERNEL_ADDRESS) return (va & ~DMAP_BASE_ADDRESS); PMAP_LOCK(kernel_pmap); pvo = moea64_pvo_find_va(kernel_pmap, va); KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR, va)); pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo)); PMAP_UNLOCK(kernel_pmap); return (pa); } /* * Remove a wired page from kernel virtual address space. */ void moea64_kremove(mmu_t mmu, vm_offset_t va) { moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { size_t l; #ifdef __powerpc64__ struct slb *slb; #endif register_t slbv; *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK); l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr); if (l > ulen) l = ulen; if (klen) *klen = l; else if (l != ulen) return (EFAULT); #ifdef __powerpc64__ /* Try lockless look-up first */ slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr); if (slb == NULL) { /* If it isn't there, we need to pre-fault the VSID */ PMAP_LOCK(pm); slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT; PMAP_UNLOCK(pm); } else { slbv = slb->slbv; } /* Mark segment no-execute */ slbv |= SLBV_N; #else slbv = va_to_vsid(pm, (vm_offset_t)uaddr); /* Mark segment no-execute */ slbv |= SR_N; #endif /* If we have already set this VSID, we can just return */ if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv) return (0); __asm __volatile("isync"); curthread->td_pcb->pcb_cpu.aim.usr_segm = (uintptr_t)uaddr >> ADDR_SR_SHFT; curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv; #ifdef __powerpc64__ __asm __volatile ("slbie %0; slbmte %1, %2; isync" :: "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE)); #else __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv)); #endif return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { vm_offset_t user_sr; if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm; addr &= ADDR_PIDX | ADDR_POFF; addr |= user_sr << ADDR_SR_SHFT; *decoded_addr = addr; *is_user = 1; } else { *decoded_addr = addr; *is_user = 0; } return (0); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. Other architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped region. */ vm_offset_t moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva, va; if (hw_direct_map) { /* * Check if every page in the region is covered by the direct * map. The direct map covers all of physical memory. Use * moea64_calc_wimg() as a shortcut to see if the page is in * physical memory as a way to see if the direct map covers it. */ for (va = pa_start; va < pa_end; va += PAGE_SIZE) if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M) break; if (va == pa_end) return (PHYS_TO_DMAP(pa_start)); } sva = *virt; va = sva; /* XXX respect prot argument */ for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE) moea64_kenter(mmu, va, pa_start); *virt = va; return (sva); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { int loops; struct pvo_entry *pvo; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("moea64_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } PV_PAGE_UNLOCK(m); return (rv); } void moea64_page_init(mmu_t mmu __unused, vm_page_t m) { m->md.mdpg_attrs = 0; m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT; LIST_INIT(&m->md.mdpg_pvoh); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int moea64_page_wired_mappings(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED) count++; PV_PAGE_UNLOCK(m); return (count); } static uintptr_t moea64_vsidcontext; uintptr_t moea64_get_unique_vsid(void) { u_int entropy; register_t hash; uint32_t mask; int i; entropy = 0; __asm __volatile("mftb %0" : "=r"(entropy)); mtx_lock(&moea64_slb_mutex); for (i = 0; i < NVSIDS; i += VSID_NBPW) { u_int n; /* * Create a new value by mutiplying by a prime and adding in * entropy from the timebase register. This is to make the * VSID more random so that the PT hash function collides * less often. (Note that the prime casues gcc to do shifts * instead of a multiply.) */ moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy; hash = moea64_vsidcontext & (NVSIDS - 1); if (hash == 0) /* 0 is special, avoid it */ continue; n = hash >> 5; mask = 1 << (hash & (VSID_NBPW - 1)); hash = (moea64_vsidcontext & VSID_HASHMASK); if (moea64_vsid_bitmap[n] & mask) { /* collision? */ /* anything free in this bucket? */ if (moea64_vsid_bitmap[n] == 0xffffffff) { entropy = (moea64_vsidcontext >> 20); continue; } i = ffs(~moea64_vsid_bitmap[n]) - 1; mask = 1 << i; hash &= rounddown2(VSID_HASHMASK, VSID_NBPW); hash |= i; } if (hash == VSID_VRMA) /* also special, avoid this too */ continue; KASSERT(!(moea64_vsid_bitmap[n] & mask), ("Allocating in-use VSID %#zx\n", hash)); moea64_vsid_bitmap[n] |= mask; mtx_unlock(&moea64_slb_mutex); return (hash); } mtx_unlock(&moea64_slb_mutex); panic("%s: out of segments",__func__); } #ifdef __powerpc64__ void moea64_pinit(mmu_t mmu, pmap_t pmap) { RB_INIT(&pmap->pmap_pvo); pmap->pm_slb_tree_root = slb_alloc_tree(); pmap->pm_slb = slb_alloc_user_cache(); pmap->pm_slb_len = 0; } #else void moea64_pinit(mmu_t mmu, pmap_t pmap) { int i; uint32_t hash; RB_INIT(&pmap->pmap_pvo); if (pmap_bootstrapped) pmap->pmap_phys = (pmap_t)moea64_kextract(mmu, (vm_offset_t)pmap); else pmap->pmap_phys = pmap; /* * Allocate some segment registers for this pmap. */ hash = moea64_get_unique_vsid(); for (i = 0; i < 16; i++) pmap->pm_sr[i] = VSID_MAKE(i, hash); KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0")); } #endif /* * Initialize the pmap associated with process 0. */ void moea64_pinit0(mmu_t mmu, pmap_t pm) { PMAP_LOCK_INIT(pm); moea64_pinit(mmu, pm); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Set the physical protection on the specified range of this map as requested. */ static void moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot) { struct vm_page *pg; vm_prot_t oldprot; int32_t refchg; PMAP_LOCK_ASSERT(pm, MA_OWNED); /* * Change the protection of the page. */ oldprot = pvo->pvo_pte.prot; pvo->pvo_pte.prot = prot; pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); /* * If the PVO is in the page table, update mapping */ refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE); if (refchg < 0) refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0; - if (pm != kernel_pmap && pg != NULL && - (vm_page_aflags(pg) & PGA_EXECUTABLE) == 0 && + if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) { if ((pg->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(pg, PGA_EXECUTABLE); moea64_syncicache(mmu, pm, PVO_VADDR(pvo), pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE); } /* * Update vm about the REF/CHG bits if the page is managed and we have * removed write access. */ if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) && (oldprot & VM_PROT_WRITE)) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } void moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct pvo_entry *pvo, *tpvo, key; CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm, sva, eva, prot); KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap, ("moea64_protect: non current pmap")); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { moea64_remove(mmu, pm, sva, eva); return; } PMAP_LOCK(pm); key.pvo_vaddr = sva; for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); moea64_pvo_protect(mmu, pm, pvo, prot); } PMAP_UNLOCK(pm); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count) { while (count-- > 0) { moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by moea64_qenter. */ void moea64_qremove(mmu_t mmu, vm_offset_t va, int count) { while (count-- > 0) { moea64_kremove(mmu, va); va += PAGE_SIZE; } } void moea64_release_vsid(uint64_t vsid) { int idx, mask; mtx_lock(&moea64_slb_mutex); idx = vsid & (NVSIDS-1); mask = 1 << (idx % VSID_NBPW); idx /= VSID_NBPW; KASSERT(moea64_vsid_bitmap[idx] & mask, ("Freeing unallocated VSID %#jx", vsid)); moea64_vsid_bitmap[idx] &= ~mask; mtx_unlock(&moea64_slb_mutex); } void moea64_release(mmu_t mmu, pmap_t pmap) { /* * Free segment registers' VSIDs */ #ifdef __powerpc64__ slb_free_tree(pmap); slb_free_user_cache(pmap->pm_slb); #else KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0")); moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0])); #endif } /* * Remove all pages mapped by the specified pmap */ void moea64_remove_pages(mmu_t mmu, pmap_t pm) { struct pvo_entry *pvo, *tpvo; struct pvo_dlist tofree; SLIST_INIT(&tofree); PMAP_LOCK(pm); RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) { if (pvo->pvo_vaddr & PVO_WIRED) continue; /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); } PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(mmu, pvo); free_pvo_entry(pvo); } } /* * Remove the given range of addresses from the specified map. */ void moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct pvo_entry *pvo, *tpvo, key; struct pvo_dlist tofree; /* * Perform an unsynchronized read. This is, however, safe. */ if (pm->pm_stats.resident_count == 0) return; key.pvo_vaddr = sva; SLIST_INIT(&tofree); PMAP_LOCK(pm); for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key); pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) { tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo); /* * For locking reasons, remove this from the page table and * pmap, but save delinking from the vm_page for a second * pass */ moea64_pvo_remove_from_pmap(mmu, pvo); SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink); } PMAP_UNLOCK(pm); while (!SLIST_EMPTY(&tofree)) { pvo = SLIST_FIRST(&tofree); SLIST_REMOVE_HEAD(&tofree, pvo_dlink); moea64_pvo_remove_from_page(mmu, pvo); free_pvo_entry(pvo); } } /* * Remove physical page from all pmaps in which it resides. moea64_pvo_remove() * will reflect changes in pte's back to the vm_page. */ void moea64_remove_all(mmu_t mmu, vm_page_t m) { struct pvo_entry *pvo, *next_pvo; struct pvo_head freequeue; int wasdead; pmap_t pmap; LIST_INIT(&freequeue); PV_PAGE_LOCK(m); LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) { pmap = pvo->pvo_pmap; PMAP_LOCK(pmap); wasdead = (pvo->pvo_vaddr & PVO_DEAD); if (!wasdead) moea64_pvo_remove_from_pmap(mmu, pvo); moea64_pvo_remove_from_page_locked(mmu, pvo, m); if (!wasdead) LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink); PMAP_UNLOCK(pmap); } KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings")); - KASSERT((vm_page_aflags(m) & PGA_WRITEABLE) == 0, - ("Page still writable")); + KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable")); PV_PAGE_UNLOCK(m); /* Clean up UMA allocations */ LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo) free_pvo_entry(pvo); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from moea64_bootstrap before avail start and end are * calculated. */ vm_offset_t moea64_bootstrap_alloc(vm_size_t size, vm_size_t align) { vm_offset_t s, e; int i, j; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (align != 0) s = roundup2(phys_avail[i], align); else s = phys_avail[i]; e = s + size; if (s < phys_avail[i] || e > phys_avail[i + 1]) continue; if (s + size > platform_real_maxaddr()) continue; if (s == phys_avail[i]) { phys_avail[i] += size; } else if (e == phys_avail[i + 1]) { phys_avail[i + 1] -= size; } else { for (j = phys_avail_count * 2; j > i; j -= 2) { phys_avail[j] = phys_avail[j - 2]; phys_avail[j + 1] = phys_avail[j - 1]; } phys_avail[i + 3] = phys_avail[i + 1]; phys_avail[i + 1] = s; phys_avail[i + 2] = e; phys_avail_count++; } return (s); } panic("moea64_bootstrap_alloc: could not allocate memory"); } static int moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head, struct pvo_entry **oldpvop) { int first, err; struct pvo_entry *old_pvo; PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); STAT_MOEA64(moea64_pvo_enter_calls++); /* * Add to pmap list */ old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); if (old_pvo != NULL) { if (oldpvop != NULL) *oldpvop = old_pvo; return (EEXIST); } /* * Remember if the list was empty and therefore will be the first * item. */ if (pvo_head != NULL) { if (LIST_FIRST(pvo_head) == NULL) first = 1; LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink); } if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count++; pvo->pvo_pmap->pm_stats.resident_count++; /* * Insert it into the hardware page table */ err = MOEA64_PTE_INSERT(mmu, pvo); if (err != 0) { panic("moea64_pvo_enter: overflow"); } STAT_MOEA64(moea64_pvo_entries++); if (pvo->pvo_pmap == kernel_pmap) isync(); #ifdef __powerpc64__ /* * Make sure all our bootstrap mappings are in the SLB as soon * as virtual memory is switched on. */ if (!pmap_bootstrapped) moea64_bootstrap_slb_prefault(PVO_VADDR(pvo), pvo->pvo_vaddr & PVO_LARGE); #endif return (first ? ENOENT : 0); } static void moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo) { struct vm_page *pg; int32_t refchg; KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap")); PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED); KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO")); /* * If there is an active pte entry, we need to deactivate it */ refchg = MOEA64_PTE_UNSET(mmu, pvo); if (refchg < 0) { /* * If it was evicted from the page table, be pessimistic and * dirty the page. */ if (pvo->pvo_pte.prot & VM_PROT_WRITE) refchg = LPTE_CHG; else refchg = 0; } /* * Update our statistics. */ pvo->pvo_pmap->pm_stats.resident_count--; if (pvo->pvo_vaddr & PVO_WIRED) pvo->pvo_pmap->pm_stats.wired_count--; /* * Remove this PVO from the pmap list. */ RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo); /* * Mark this for the next sweep */ pvo->pvo_vaddr |= PVO_DEAD; /* Send RC bits to VM */ if ((pvo->pvo_vaddr & PVO_MANAGED) && (pvo->pvo_pte.prot & VM_PROT_WRITE)) { pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); if (pg != NULL) { refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs); if (refchg & LPTE_CHG) vm_page_dirty(pg); if (refchg & LPTE_REF) vm_page_aflag_set(pg, PGA_REFERENCED); } } } static inline void moea64_pvo_remove_from_page_locked(mmu_t mmu, struct pvo_entry *pvo, vm_page_t m) { KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page")); /* Use NULL pmaps as a sentinel for races in page deletion */ if (pvo->pvo_pmap == NULL) return; pvo->pvo_pmap = NULL; /* * Update vm about page writeability/executability if managed */ PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN); if (pvo->pvo_vaddr & PVO_MANAGED) { if (m != NULL) { LIST_REMOVE(pvo, pvo_vlink); if (LIST_EMPTY(vm_page_to_pvoh(m))) vm_page_aflag_clear(m, PGA_WRITEABLE | PGA_EXECUTABLE); } } STAT_MOEA64(moea64_pvo_entries--); STAT_MOEA64(moea64_pvo_remove_calls++); } static void moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo) { vm_page_t pg = NULL; if (pvo->pvo_vaddr & PVO_MANAGED) pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN); PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN); moea64_pvo_remove_from_page_locked(mmu, pvo, pg); PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN); } static struct pvo_entry * moea64_pvo_find_va(pmap_t pm, vm_offset_t va) { struct pvo_entry key; PMAP_LOCK_ASSERT(pm, MA_OWNED); key.pvo_vaddr = va & ~ADDR_POFF; return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key)); } static boolean_t moea64_query_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit) { struct pvo_entry *pvo; int64_t ret; boolean_t rv; /* * See if this bit is stored in the page already. */ if (m->md.mdpg_attrs & ptebit) return (TRUE); /* * Examine each PTE. Sync so that any pending REF/CHG bits are * flushed to the PTEs. */ rv = FALSE; powerpc_sync(); PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; /* * See if this pvo has a valid PTE. if so, fetch the * REF/CHG bits from the valid PTE. If the appropriate * ptebit is set, return success. */ PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_SYNCH(mmu, pvo); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0) { atomic_set_32(&m->md.mdpg_attrs, ret & (LPTE_CHG | LPTE_REF)); if (ret & ptebit) { rv = TRUE; break; } } } PV_PAGE_UNLOCK(m); return (rv); } static u_int moea64_clear_bit(mmu_t mmu, vm_page_t m, u_int64_t ptebit) { u_int count; struct pvo_entry *pvo; int64_t ret; /* * Sync so that any pending REF/CHG bits are flushed to the PTEs (so * we can reset the right ones). */ powerpc_sync(); /* * For each pvo entry, clear the pte's ptebit. */ count = 0; PV_PAGE_LOCK(m); LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) { ret = 0; PMAP_LOCK(pvo->pvo_pmap); if (!(pvo->pvo_vaddr & PVO_DEAD)) ret = MOEA64_PTE_CLEAR(mmu, pvo, ptebit); PMAP_UNLOCK(pvo->pvo_pmap); if (ret > 0 && (ret & ptebit)) count++; } atomic_clear_32(&m->md.mdpg_attrs, ptebit); PV_PAGE_UNLOCK(m); return (count); } boolean_t moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { struct pvo_entry *pvo, key; vm_offset_t ppa; int error = 0; if (hw_direct_map && mem_valid(pa, size) == 0) return (0); PMAP_LOCK(kernel_pmap); ppa = pa & ~ADDR_POFF; key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa; for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key); ppa < pa + size; ppa += PAGE_SIZE, pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) { if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) { error = EFAULT; break; } } PMAP_UNLOCK(kernel_pmap); return (error); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * moea64_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { vm_offset_t va, tmpva, ppa, offset; ppa = trunc_page(pa); offset = pa & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); va = kva_alloc(size); if (!va) panic("moea64_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { moea64_kenter_attr(mmu, tmpva, ppa, ma); size -= PAGE_SIZE; tmpva += PAGE_SIZE; ppa += PAGE_SIZE; } return ((void *)(va + offset)); } void * moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT); } void moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { vm_offset_t base, offset; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup2(offset + size, PAGE_SIZE); kva_free(base, size); } void moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { struct pvo_entry *pvo; vm_offset_t lim; vm_paddr_t pa; vm_size_t len; if (__predict_false(pm == NULL)) pm = &curthread->td_proc->p_vmspace->vm_pmap; PMAP_LOCK(pm); while (sz > 0) { lim = round_page(va+1); len = MIN(lim - va, sz); pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) { pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF); moea64_syncicache(mmu, pm, va, pa, len); } va += len; sz -= len; } PMAP_UNLOCK(pm); } void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { *va = (void *)(uintptr_t)pa; } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void moea64_scan_init(mmu_t mmu) { struct pvo_entry *pvo; vm_offset_t va; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&pregions, &pregions_sz, ®ions, ®ions_sz); for (i = 0; i < pregions_sz; i++) { dump_map[i].pa_start = pregions[i].mr_start; dump_map[i].pa_size = pregions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr; dump_map[1].pa_size = round_page(msgbufp->msg_size); /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF); if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } Index: head/sys/powerpc/booke/pmap.c =================================================================== --- head/sys/powerpc/booke/pmap.c (revision 352406) +++ head/sys/powerpc/booke/pmap.c (revision 352407) @@ -1,4376 +1,4376 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2007-2009 Semihalf, Rafal Jaworowski * Copyright (C) 2006 Semihalf, Marian Balakowicz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Some hw specific parts of this pmap were derived or influenced * by NetBSD's ibm4xx pmap module. More generic code is shared with * a few other pmap modules from the FreeBSD tree. */ /* * VM layout notes: * * Kernel and user threads run within one common virtual address space * defined by AS=0. * * 32-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000 - 0x7fff_ffff : user process * 0x8000_0000 - 0xbfff_ffff : pmap_mapdev()-ed area (PCI/PCIE etc.) * 0xc000_0000 - 0xc0ff_ffff : kernel reserved * 0xc000_0000 - data_end : kernel code+data, env, metadata etc. * 0xc100_0000 - 0xffff_ffff : KVA * 0xc100_0000 - 0xc100_3fff : reserved for page zero/copy * 0xc100_4000 - 0xc200_3fff : reserved for ptbl bufs * 0xc200_4000 - 0xc200_8fff : guard page + kstack0 * 0xc200_9000 - 0xfeef_ffff : actual free KVA space * * 64-bit pmap: * Virtual address space layout: * ----------------------------- * 0x0000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff : user process * 0x0000_0000_0000_0000 - 0x8fff_ffff_ffff_ffff : text, data, heap, maps, libraries * 0x9000_0000_0000_0000 - 0xafff_ffff_ffff_ffff : mmio region * 0xb000_0000_0000_0000 - 0xbfff_ffff_ffff_ffff : stack * 0xc000_0000_0000_0000 - 0xcfff_ffff_ffff_ffff : kernel reserved * 0xc000_0000_0000_0000 - endkernel-1 : kernel code & data * endkernel - msgbufp-1 : flat device tree * msgbufp - kernel_pdir-1 : message buffer * kernel_pdir - kernel_pp2d-1 : kernel page directory * kernel_pp2d - . : kernel pointers to page directory * pmap_zero_copy_min - crashdumpmap-1 : reserved for page zero/copy * crashdumpmap - ptbl_buf_pool_vabase-1 : reserved for ptbl bufs * ptbl_buf_pool_vabase - virtual_avail-1 : user page directories and page tables * virtual_avail - 0xcfff_ffff_ffff_ffff : actual free KVA space * 0xd000_0000_0000_0000 - 0xdfff_ffff_ffff_ffff : coprocessor region * 0xe000_0000_0000_0000 - 0xefff_ffff_ffff_ffff : mmio region * 0xf000_0000_0000_0000 - 0xffff_ffff_ffff_ffff : direct map * 0xf000_0000_0000_0000 - +Maxmem : physmem map * - 0xffff_ffff_ffff_ffff : device direct map */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmu_if.h" #define SPARSE_MAPDEV #ifdef DEBUG #define debugf(fmt, args...) printf(fmt, ##args) #else #define debugf(fmt, args...) #endif #ifdef __powerpc64__ #define PRI0ptrX "016lx" #else #define PRI0ptrX "08x" #endif #define TODO panic("%s: not implemented", __func__); extern unsigned char _etext[]; extern unsigned char _end[]; extern uint32_t *bootinfo; vm_paddr_t kernload; vm_offset_t kernstart; vm_size_t kernsize; /* Message buffer and tables. */ static vm_offset_t data_start; static vm_size_t data_end; /* Phys/avail memory regions. */ static struct mem_region *availmem_regions; static int availmem_regions_sz; static struct mem_region *physmem_regions; static int physmem_regions_sz; #ifndef __powerpc64__ /* Reserved KVA space and mutex for mmu_booke_zero_page. */ static vm_offset_t zero_page_va; static struct mtx zero_page_mutex; /* Reserved KVA space and mutex for mmu_booke_copy_page. */ static vm_offset_t copy_page_src_va; static vm_offset_t copy_page_dst_va; static struct mtx copy_page_mutex; #endif static struct mtx tlbivax_mutex; /**************************************************************************/ /* PMAP */ /**************************************************************************/ static int mmu_booke_enter_locked(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); unsigned int kptbl_min; /* Index of the first kernel ptbl. */ unsigned int kernel_ptbls; /* Number of KVA ptbls. */ #ifdef __powerpc64__ unsigned int kernel_pdirs; #endif static uma_zone_t ptbl_root_zone; /* * If user pmap is processed with mmu_booke_remove and the resident count * drops to 0, there are no more pages to remove, so we need not continue. */ #define PMAP_REMOVE_DONE(pmap) \ ((pmap) != kernel_pmap && (pmap)->pm_stats.resident_count == 0) #if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__) extern int elf32_nxstack; #endif /**************************************************************************/ /* TLB and TID handling */ /**************************************************************************/ /* Translation ID busy table */ static volatile pmap_t tidbusy[MAXCPU][TID_MAX + 1]; /* * TLB0 capabilities (entry, way numbers etc.). These can vary between e500 * core revisions and should be read from h/w registers during early config. */ uint32_t tlb0_entries; uint32_t tlb0_ways; uint32_t tlb0_entries_per_way; uint32_t tlb1_entries; #define TLB0_ENTRIES (tlb0_entries) #define TLB0_WAYS (tlb0_ways) #define TLB0_ENTRIES_PER_WAY (tlb0_entries_per_way) #define TLB1_ENTRIES (tlb1_entries) static vm_offset_t tlb1_map_base = (vm_offset_t)VM_MAXUSER_ADDRESS + PAGE_SIZE; static tlbtid_t tid_alloc(struct pmap *); static void tid_flush(tlbtid_t tid); #ifdef DDB #ifdef __powerpc64__ static void tlb_print_entry(int, uint32_t, uint64_t, uint32_t, uint32_t); #else static void tlb_print_entry(int, uint32_t, uint32_t, uint32_t, uint32_t); #endif #endif static void tlb1_read_entry(tlb_entry_t *, unsigned int); static void tlb1_write_entry(tlb_entry_t *, unsigned int); static int tlb1_iomapped(int, vm_paddr_t, vm_size_t, vm_offset_t *); static vm_size_t tlb1_mapin_region(vm_offset_t, vm_paddr_t, vm_size_t); static vm_size_t tsize2size(unsigned int); static unsigned int size2tsize(vm_size_t); static unsigned int ilog2(unsigned long); static void set_mas4_defaults(void); static inline void tlb0_flush_entry(vm_offset_t); static inline unsigned int tlb0_tableidx(vm_offset_t, unsigned int); /**************************************************************************/ /* Page table management */ /**************************************************************************/ static struct rwlock_padalign pvh_global_lock; /* Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; #define PV_ENTRY_ZONE_MIN 2048 /* min pv entries in uma zone */ #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #ifdef __powerpc64__ #define PMAP_ROOT_SIZE (sizeof(pte_t***) * PP2D_NENTRIES) static pte_t *ptbl_alloc(mmu_t, pmap_t, pte_t **, unsigned int, boolean_t); static void ptbl_free(mmu_t, pmap_t, pte_t **, unsigned int, vm_page_t); static void ptbl_hold(mmu_t, pmap_t, pte_t **, unsigned int); static int ptbl_unhold(mmu_t, pmap_t, vm_offset_t); #else #define PMAP_ROOT_SIZE (sizeof(pte_t**) * PDIR_NENTRIES) static void ptbl_init(void); static struct ptbl_buf *ptbl_buf_alloc(void); static void ptbl_buf_free(struct ptbl_buf *); static void ptbl_free_pmap_ptbl(pmap_t, pte_t *); static pte_t *ptbl_alloc(mmu_t, pmap_t, unsigned int, boolean_t); static void ptbl_free(mmu_t, pmap_t, unsigned int); static void ptbl_hold(mmu_t, pmap_t, unsigned int); static int ptbl_unhold(mmu_t, pmap_t, unsigned int); #endif static vm_paddr_t pte_vatopa(mmu_t, pmap_t, vm_offset_t); static int pte_enter(mmu_t, pmap_t, vm_page_t, vm_offset_t, uint32_t, boolean_t); static int pte_remove(mmu_t, pmap_t, vm_offset_t, uint8_t); static pte_t *pte_find(mmu_t, pmap_t, vm_offset_t); static void kernel_pte_alloc(vm_offset_t, vm_offset_t, vm_offset_t); static pv_entry_t pv_alloc(void); static void pv_free(pv_entry_t); static void pv_insert(pmap_t, vm_offset_t, vm_page_t); static void pv_remove(pmap_t, vm_offset_t, vm_page_t); static void booke_pmap_init_qpages(void); struct ptbl_buf { TAILQ_ENTRY(ptbl_buf) link; /* list link */ vm_offset_t kva; /* va of mapping */ }; #ifndef __powerpc64__ /* Number of kva ptbl buffers, each covering one ptbl (PTBL_PAGES). */ #define PTBL_BUFS (128 * 16) /* ptbl free list and a lock used for access synchronization. */ static TAILQ_HEAD(, ptbl_buf) ptbl_buf_freelist; static struct mtx ptbl_buf_freelist_lock; /* Base address of kva space allocated fot ptbl bufs. */ static vm_offset_t ptbl_buf_pool_vabase; /* Pointer to ptbl_buf structures. */ static struct ptbl_buf *ptbl_bufs; #endif #ifdef SMP extern tlb_entry_t __boot_tlb1[]; void pmap_bootstrap_ap(volatile uint32_t *); #endif /* * Kernel MMU interface */ static void mmu_booke_clear_modify(mmu_t, vm_page_t); static void mmu_booke_copy(mmu_t, pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); static void mmu_booke_copy_page(mmu_t, vm_page_t, vm_page_t); static void mmu_booke_copy_pages(mmu_t, vm_page_t *, vm_offset_t, vm_page_t *, vm_offset_t, int); static int mmu_booke_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int flags, int8_t psind); static void mmu_booke_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t, vm_prot_t); static void mmu_booke_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t); static vm_paddr_t mmu_booke_extract(mmu_t, pmap_t, vm_offset_t); static vm_page_t mmu_booke_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t); static void mmu_booke_init(mmu_t); static boolean_t mmu_booke_is_modified(mmu_t, vm_page_t); static boolean_t mmu_booke_is_prefaultable(mmu_t, pmap_t, vm_offset_t); static boolean_t mmu_booke_is_referenced(mmu_t, vm_page_t); static int mmu_booke_ts_referenced(mmu_t, vm_page_t); static vm_offset_t mmu_booke_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int); static int mmu_booke_mincore(mmu_t, pmap_t, vm_offset_t, vm_paddr_t *); static void mmu_booke_object_init_pt(mmu_t, pmap_t, vm_offset_t, vm_object_t, vm_pindex_t, vm_size_t); static boolean_t mmu_booke_page_exists_quick(mmu_t, pmap_t, vm_page_t); static void mmu_booke_page_init(mmu_t, vm_page_t); static int mmu_booke_page_wired_mappings(mmu_t, vm_page_t); static void mmu_booke_pinit(mmu_t, pmap_t); static void mmu_booke_pinit0(mmu_t, pmap_t); static void mmu_booke_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); static void mmu_booke_qenter(mmu_t, vm_offset_t, vm_page_t *, int); static void mmu_booke_qremove(mmu_t, vm_offset_t, int); static void mmu_booke_release(mmu_t, pmap_t); static void mmu_booke_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t); static void mmu_booke_remove_all(mmu_t, vm_page_t); static void mmu_booke_remove_write(mmu_t, vm_page_t); static void mmu_booke_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t); static void mmu_booke_zero_page(mmu_t, vm_page_t); static void mmu_booke_zero_page_area(mmu_t, vm_page_t, int, int); static void mmu_booke_activate(mmu_t, struct thread *); static void mmu_booke_deactivate(mmu_t, struct thread *); static void mmu_booke_bootstrap(mmu_t, vm_offset_t, vm_offset_t); static void *mmu_booke_mapdev(mmu_t, vm_paddr_t, vm_size_t); static void *mmu_booke_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t); static void mmu_booke_unmapdev(mmu_t, vm_offset_t, vm_size_t); static vm_paddr_t mmu_booke_kextract(mmu_t, vm_offset_t); static void mmu_booke_kenter(mmu_t, vm_offset_t, vm_paddr_t); static void mmu_booke_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t); static void mmu_booke_kremove(mmu_t, vm_offset_t); static boolean_t mmu_booke_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t); static void mmu_booke_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t); static void mmu_booke_dumpsys_map(mmu_t, vm_paddr_t pa, size_t, void **); static void mmu_booke_dumpsys_unmap(mmu_t, vm_paddr_t pa, size_t, void *); static void mmu_booke_scan_init(mmu_t); static vm_offset_t mmu_booke_quick_enter_page(mmu_t mmu, vm_page_t m); static void mmu_booke_quick_remove_page(mmu_t mmu, vm_offset_t addr); static int mmu_booke_change_attr(mmu_t mmu, vm_offset_t addr, vm_size_t sz, vm_memattr_t mode); static int mmu_booke_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen); static int mmu_booke_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr); static mmu_method_t mmu_booke_methods[] = { /* pmap dispatcher interface */ MMUMETHOD(mmu_clear_modify, mmu_booke_clear_modify), MMUMETHOD(mmu_copy, mmu_booke_copy), MMUMETHOD(mmu_copy_page, mmu_booke_copy_page), MMUMETHOD(mmu_copy_pages, mmu_booke_copy_pages), MMUMETHOD(mmu_enter, mmu_booke_enter), MMUMETHOD(mmu_enter_object, mmu_booke_enter_object), MMUMETHOD(mmu_enter_quick, mmu_booke_enter_quick), MMUMETHOD(mmu_extract, mmu_booke_extract), MMUMETHOD(mmu_extract_and_hold, mmu_booke_extract_and_hold), MMUMETHOD(mmu_init, mmu_booke_init), MMUMETHOD(mmu_is_modified, mmu_booke_is_modified), MMUMETHOD(mmu_is_prefaultable, mmu_booke_is_prefaultable), MMUMETHOD(mmu_is_referenced, mmu_booke_is_referenced), MMUMETHOD(mmu_ts_referenced, mmu_booke_ts_referenced), MMUMETHOD(mmu_map, mmu_booke_map), MMUMETHOD(mmu_mincore, mmu_booke_mincore), MMUMETHOD(mmu_object_init_pt, mmu_booke_object_init_pt), MMUMETHOD(mmu_page_exists_quick,mmu_booke_page_exists_quick), MMUMETHOD(mmu_page_init, mmu_booke_page_init), MMUMETHOD(mmu_page_wired_mappings, mmu_booke_page_wired_mappings), MMUMETHOD(mmu_pinit, mmu_booke_pinit), MMUMETHOD(mmu_pinit0, mmu_booke_pinit0), MMUMETHOD(mmu_protect, mmu_booke_protect), MMUMETHOD(mmu_qenter, mmu_booke_qenter), MMUMETHOD(mmu_qremove, mmu_booke_qremove), MMUMETHOD(mmu_release, mmu_booke_release), MMUMETHOD(mmu_remove, mmu_booke_remove), MMUMETHOD(mmu_remove_all, mmu_booke_remove_all), MMUMETHOD(mmu_remove_write, mmu_booke_remove_write), MMUMETHOD(mmu_sync_icache, mmu_booke_sync_icache), MMUMETHOD(mmu_unwire, mmu_booke_unwire), MMUMETHOD(mmu_zero_page, mmu_booke_zero_page), MMUMETHOD(mmu_zero_page_area, mmu_booke_zero_page_area), MMUMETHOD(mmu_activate, mmu_booke_activate), MMUMETHOD(mmu_deactivate, mmu_booke_deactivate), MMUMETHOD(mmu_quick_enter_page, mmu_booke_quick_enter_page), MMUMETHOD(mmu_quick_remove_page, mmu_booke_quick_remove_page), /* Internal interfaces */ MMUMETHOD(mmu_bootstrap, mmu_booke_bootstrap), MMUMETHOD(mmu_dev_direct_mapped,mmu_booke_dev_direct_mapped), MMUMETHOD(mmu_mapdev, mmu_booke_mapdev), MMUMETHOD(mmu_mapdev_attr, mmu_booke_mapdev_attr), MMUMETHOD(mmu_kenter, mmu_booke_kenter), MMUMETHOD(mmu_kenter_attr, mmu_booke_kenter_attr), MMUMETHOD(mmu_kextract, mmu_booke_kextract), MMUMETHOD(mmu_kremove, mmu_booke_kremove), MMUMETHOD(mmu_unmapdev, mmu_booke_unmapdev), MMUMETHOD(mmu_change_attr, mmu_booke_change_attr), MMUMETHOD(mmu_map_user_ptr, mmu_booke_map_user_ptr), MMUMETHOD(mmu_decode_kernel_ptr, mmu_booke_decode_kernel_ptr), /* dumpsys() support */ MMUMETHOD(mmu_dumpsys_map, mmu_booke_dumpsys_map), MMUMETHOD(mmu_dumpsys_unmap, mmu_booke_dumpsys_unmap), MMUMETHOD(mmu_scan_init, mmu_booke_scan_init), { 0, 0 } }; MMU_DEF(booke_mmu, MMU_TYPE_BOOKE, mmu_booke_methods, 0); static __inline uint32_t tlb_calc_wimg(vm_paddr_t pa, vm_memattr_t ma) { uint32_t attrib; int i; if (ma != VM_MEMATTR_DEFAULT) { switch (ma) { case VM_MEMATTR_UNCACHEABLE: return (MAS2_I | MAS2_G); case VM_MEMATTR_WRITE_COMBINING: case VM_MEMATTR_WRITE_BACK: case VM_MEMATTR_PREFETCHABLE: return (MAS2_I); case VM_MEMATTR_WRITE_THROUGH: return (MAS2_W | MAS2_M); case VM_MEMATTR_CACHEABLE: return (MAS2_M); } } /* * Assume the page is cache inhibited and access is guarded unless * it's in our available memory array. */ attrib = _TLB_ENTRY_IO; for (i = 0; i < physmem_regions_sz; i++) { if ((pa >= physmem_regions[i].mr_start) && (pa < (physmem_regions[i].mr_start + physmem_regions[i].mr_size))) { attrib = _TLB_ENTRY_MEM; break; } } return (attrib); } static inline void tlb_miss_lock(void) { #ifdef SMP struct pcpu *pc; if (!smp_started) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc != pcpup) { CTR3(KTR_PMAP, "%s: tlb miss LOCK of CPU=%d, " "tlb_lock=%p", __func__, pc->pc_cpuid, pc->pc_booke.tlb_lock); KASSERT((pc->pc_cpuid != PCPU_GET(cpuid)), ("tlb_miss_lock: tried to lock self")); tlb_lock(pc->pc_booke.tlb_lock); CTR1(KTR_PMAP, "%s: locked", __func__); } } #endif } static inline void tlb_miss_unlock(void) { #ifdef SMP struct pcpu *pc; if (!smp_started) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc != pcpup) { CTR2(KTR_PMAP, "%s: tlb miss UNLOCK of CPU=%d", __func__, pc->pc_cpuid); tlb_unlock(pc->pc_booke.tlb_lock); CTR1(KTR_PMAP, "%s: unlocked", __func__); } } #endif } /* Return number of entries in TLB0. */ static __inline void tlb0_get_tlbconf(void) { uint32_t tlb0_cfg; tlb0_cfg = mfspr(SPR_TLB0CFG); tlb0_entries = tlb0_cfg & TLBCFG_NENTRY_MASK; tlb0_ways = (tlb0_cfg & TLBCFG_ASSOC_MASK) >> TLBCFG_ASSOC_SHIFT; tlb0_entries_per_way = tlb0_entries / tlb0_ways; } /* Return number of entries in TLB1. */ static __inline void tlb1_get_tlbconf(void) { uint32_t tlb1_cfg; tlb1_cfg = mfspr(SPR_TLB1CFG); tlb1_entries = tlb1_cfg & TLBCFG_NENTRY_MASK; } /**************************************************************************/ /* Page table related */ /**************************************************************************/ #ifdef __powerpc64__ /* Initialize pool of kva ptbl buffers. */ static void ptbl_init(void) { } /* Get a pointer to a PTE in a page table. */ static __inline pte_t * pte_find(mmu_t mmu, pmap_t pmap, vm_offset_t va) { pte_t **pdir; pte_t *ptbl; KASSERT((pmap != NULL), ("pte_find: invalid pmap")); pdir = pmap->pm_pp2d[PP2D_IDX(va)]; if (!pdir) return NULL; ptbl = pdir[PDIR_IDX(va)]; return ((ptbl != NULL) ? &ptbl[PTBL_IDX(va)] : NULL); } /* * allocate a page of pointers to page directories, do not preallocate the * page tables */ static pte_t ** pdir_alloc(mmu_t mmu, pmap_t pmap, unsigned int pp2d_idx, bool nosleep) { vm_page_t m; pte_t **pdir; int req; req = VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; while ((m = vm_page_alloc(NULL, pp2d_idx, req)) == NULL) { PMAP_UNLOCK(pmap); if (nosleep) { return (NULL); } vm_wait(NULL); PMAP_LOCK(pmap); } /* Zero whole ptbl. */ pdir = (pte_t **)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); mmu_booke_zero_page(mmu, m); return (pdir); } /* Free pdir pages and invalidate pdir entry. */ static void pdir_free(mmu_t mmu, pmap_t pmap, unsigned int pp2d_idx, vm_page_t m) { pte_t **pdir; pdir = pmap->pm_pp2d[pp2d_idx]; KASSERT((pdir != NULL), ("pdir_free: null pdir")); pmap->pm_pp2d[pp2d_idx] = NULL; vm_wire_sub(1); vm_page_free_zero(m); } /* * Decrement pdir pages hold count and attempt to free pdir pages. Called * when removing directory entry from pdir. * * Return 1 if pdir pages were freed. */ static int pdir_unhold(mmu_t mmu, pmap_t pmap, u_int pp2d_idx) { pte_t **pdir; vm_paddr_t pa; vm_page_t m; KASSERT((pmap != kernel_pmap), ("pdir_unhold: unholding kernel pdir!")); pdir = pmap->pm_pp2d[pp2d_idx]; /* decrement hold count */ pa = DMAP_TO_PHYS((vm_offset_t) pdir); m = PHYS_TO_VM_PAGE(pa); /* * Free pdir page if there are no dir entries in this pdir. */ m->wire_count--; if (m->wire_count == 0) { pdir_free(mmu, pmap, pp2d_idx, m); return (1); } return (0); } /* * Increment hold count for pdir pages. This routine is used when new ptlb * entry is being inserted into pdir. */ static void pdir_hold(mmu_t mmu, pmap_t pmap, pte_t ** pdir) { vm_page_t m; KASSERT((pmap != kernel_pmap), ("pdir_hold: holding kernel pdir!")); KASSERT((pdir != NULL), ("pdir_hold: null pdir")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pdir)); m->wire_count++; } /* Allocate page table. */ static pte_t * ptbl_alloc(mmu_t mmu, pmap_t pmap, pte_t ** pdir, unsigned int pdir_idx, boolean_t nosleep) { vm_page_t m; pte_t *ptbl; int req; KASSERT((pdir[pdir_idx] == NULL), ("%s: valid ptbl entry exists!", __func__)); req = VM_ALLOC_NOOBJ | VM_ALLOC_WIRED; while ((m = vm_page_alloc(NULL, pdir_idx, req)) == NULL) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); if (nosleep) { return (NULL); } vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* Zero whole ptbl. */ ptbl = (pte_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); mmu_booke_zero_page(mmu, m); return (ptbl); } /* Free ptbl pages and invalidate pdir entry. */ static void ptbl_free(mmu_t mmu, pmap_t pmap, pte_t ** pdir, unsigned int pdir_idx, vm_page_t m) { pte_t *ptbl; ptbl = pdir[pdir_idx]; KASSERT((ptbl != NULL), ("ptbl_free: null ptbl")); pdir[pdir_idx] = NULL; vm_wire_sub(1); vm_page_free_zero(m); } /* * Decrement ptbl pages hold count and attempt to free ptbl pages. Called * when removing pte entry from ptbl. * * Return 1 if ptbl pages were freed. */ static int ptbl_unhold(mmu_t mmu, pmap_t pmap, vm_offset_t va) { pte_t *ptbl; vm_page_t m; u_int pp2d_idx; pte_t **pdir; u_int pdir_idx; pp2d_idx = PP2D_IDX(va); pdir_idx = PDIR_IDX(va); KASSERT((pmap != kernel_pmap), ("ptbl_unhold: unholding kernel ptbl!")); pdir = pmap->pm_pp2d[pp2d_idx]; ptbl = pdir[pdir_idx]; /* decrement hold count */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t) ptbl)); /* * Free ptbl pages if there are no pte entries in this ptbl. * wire_count has the same value for all ptbl pages, so check the * last page. */ m->wire_count--; if (m->wire_count == 0) { ptbl_free(mmu, pmap, pdir, pdir_idx, m); pdir_unhold(mmu, pmap, pp2d_idx); return (1); } return (0); } /* * Increment hold count for ptbl pages. This routine is used when new pte * entry is being inserted into ptbl. */ static void ptbl_hold(mmu_t mmu, pmap_t pmap, pte_t ** pdir, unsigned int pdir_idx) { pte_t *ptbl; vm_page_t m; KASSERT((pmap != kernel_pmap), ("ptbl_hold: holding kernel ptbl!")); ptbl = pdir[pdir_idx]; KASSERT((ptbl != NULL), ("ptbl_hold: null ptbl")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t) ptbl)); m->wire_count++; } #else /* Initialize pool of kva ptbl buffers. */ static void ptbl_init(void) { int i; CTR3(KTR_PMAP, "%s: s (ptbl_bufs = 0x%08x size 0x%08x)", __func__, (uint32_t)ptbl_bufs, sizeof(struct ptbl_buf) * PTBL_BUFS); CTR3(KTR_PMAP, "%s: s (ptbl_buf_pool_vabase = 0x%08x size = 0x%08x)", __func__, ptbl_buf_pool_vabase, PTBL_BUFS * PTBL_PAGES * PAGE_SIZE); mtx_init(&ptbl_buf_freelist_lock, "ptbl bufs lock", NULL, MTX_DEF); TAILQ_INIT(&ptbl_buf_freelist); for (i = 0; i < PTBL_BUFS; i++) { ptbl_bufs[i].kva = ptbl_buf_pool_vabase + i * PTBL_PAGES * PAGE_SIZE; TAILQ_INSERT_TAIL(&ptbl_buf_freelist, &ptbl_bufs[i], link); } } /* Get a ptbl_buf from the freelist. */ static struct ptbl_buf * ptbl_buf_alloc(void) { struct ptbl_buf *buf; mtx_lock(&ptbl_buf_freelist_lock); buf = TAILQ_FIRST(&ptbl_buf_freelist); if (buf != NULL) TAILQ_REMOVE(&ptbl_buf_freelist, buf, link); mtx_unlock(&ptbl_buf_freelist_lock); CTR2(KTR_PMAP, "%s: buf = %p", __func__, buf); return (buf); } /* Return ptbl buff to free pool. */ static void ptbl_buf_free(struct ptbl_buf *buf) { CTR2(KTR_PMAP, "%s: buf = %p", __func__, buf); mtx_lock(&ptbl_buf_freelist_lock); TAILQ_INSERT_TAIL(&ptbl_buf_freelist, buf, link); mtx_unlock(&ptbl_buf_freelist_lock); } /* * Search the list of allocated ptbl bufs and find on list of allocated ptbls */ static void ptbl_free_pmap_ptbl(pmap_t pmap, pte_t *ptbl) { struct ptbl_buf *pbuf; CTR2(KTR_PMAP, "%s: ptbl = %p", __func__, ptbl); PMAP_LOCK_ASSERT(pmap, MA_OWNED); TAILQ_FOREACH(pbuf, &pmap->pm_ptbl_list, link) if (pbuf->kva == (vm_offset_t)ptbl) { /* Remove from pmap ptbl buf list. */ TAILQ_REMOVE(&pmap->pm_ptbl_list, pbuf, link); /* Free corresponding ptbl buf. */ ptbl_buf_free(pbuf); break; } } /* Allocate page table. */ static pte_t * ptbl_alloc(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx, boolean_t nosleep) { vm_page_t mtbl[PTBL_PAGES]; vm_page_t m; struct ptbl_buf *pbuf; unsigned int pidx; pte_t *ptbl; int i, j; CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap, (pmap == kernel_pmap), pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_alloc: invalid pdir_idx")); KASSERT((pmap->pm_pdir[pdir_idx] == NULL), ("pte_alloc: valid ptbl entry exists!")); pbuf = ptbl_buf_alloc(); if (pbuf == NULL) panic("pte_alloc: couldn't alloc kernel virtual memory"); ptbl = (pte_t *)pbuf->kva; CTR2(KTR_PMAP, "%s: ptbl kva = %p", __func__, ptbl); for (i = 0; i < PTBL_PAGES; i++) { pidx = (PTBL_PAGES * pdir_idx) + i; while ((m = vm_page_alloc(NULL, pidx, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); if (nosleep) { ptbl_free_pmap_ptbl(pmap, ptbl); for (j = 0; j < i; j++) vm_page_free(mtbl[j]); vm_wire_sub(i); return (NULL); } vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } mtbl[i] = m; } /* Map allocated pages into kernel_pmap. */ mmu_booke_qenter(mmu, (vm_offset_t)ptbl, mtbl, PTBL_PAGES); /* Zero whole ptbl. */ bzero((caddr_t)ptbl, PTBL_PAGES * PAGE_SIZE); /* Add pbuf to the pmap ptbl bufs list. */ TAILQ_INSERT_TAIL(&pmap->pm_ptbl_list, pbuf, link); return (ptbl); } /* Free ptbl pages and invalidate pdir entry. */ static void ptbl_free(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx) { pte_t *ptbl; vm_paddr_t pa; vm_offset_t va; vm_page_t m; int i; CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap, (pmap == kernel_pmap), pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_free: invalid pdir_idx")); ptbl = pmap->pm_pdir[pdir_idx]; CTR2(KTR_PMAP, "%s: ptbl = %p", __func__, ptbl); KASSERT((ptbl != NULL), ("ptbl_free: null ptbl")); /* * Invalidate the pdir entry as soon as possible, so that other CPUs * don't attempt to look up the page tables we are releasing. */ mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); pmap->pm_pdir[pdir_idx] = NULL; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); for (i = 0; i < PTBL_PAGES; i++) { va = ((vm_offset_t)ptbl + (i * PAGE_SIZE)); pa = pte_vatopa(mmu, kernel_pmap, va); m = PHYS_TO_VM_PAGE(pa); vm_page_free_zero(m); vm_wire_sub(1); mmu_booke_kremove(mmu, va); } ptbl_free_pmap_ptbl(pmap, ptbl); } /* * Decrement ptbl pages hold count and attempt to free ptbl pages. * Called when removing pte entry from ptbl. * * Return 1 if ptbl pages were freed. */ static int ptbl_unhold(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx) { pte_t *ptbl; vm_paddr_t pa; vm_page_t m; int i; CTR4(KTR_PMAP, "%s: pmap = %p su = %d pdir_idx = %d", __func__, pmap, (pmap == kernel_pmap), pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_unhold: invalid pdir_idx")); KASSERT((pmap != kernel_pmap), ("ptbl_unhold: unholding kernel ptbl!")); ptbl = pmap->pm_pdir[pdir_idx]; //debugf("ptbl_unhold: ptbl = 0x%08x\n", (u_int32_t)ptbl); KASSERT(((vm_offset_t)ptbl >= VM_MIN_KERNEL_ADDRESS), ("ptbl_unhold: non kva ptbl")); /* decrement hold count */ for (i = 0; i < PTBL_PAGES; i++) { pa = pte_vatopa(mmu, kernel_pmap, (vm_offset_t)ptbl + (i * PAGE_SIZE)); m = PHYS_TO_VM_PAGE(pa); m->wire_count--; } /* * Free ptbl pages if there are no pte etries in this ptbl. * wire_count has the same value for all ptbl pages, so check the last * page. */ if (m->wire_count == 0) { ptbl_free(mmu, pmap, pdir_idx); //debugf("ptbl_unhold: e (freed ptbl)\n"); return (1); } return (0); } /* * Increment hold count for ptbl pages. This routine is used when a new pte * entry is being inserted into the ptbl. */ static void ptbl_hold(mmu_t mmu, pmap_t pmap, unsigned int pdir_idx) { vm_paddr_t pa; pte_t *ptbl; vm_page_t m; int i; CTR3(KTR_PMAP, "%s: pmap = %p pdir_idx = %d", __func__, pmap, pdir_idx); KASSERT((pdir_idx <= (VM_MAXUSER_ADDRESS / PDIR_SIZE)), ("ptbl_hold: invalid pdir_idx")); KASSERT((pmap != kernel_pmap), ("ptbl_hold: holding kernel ptbl!")); ptbl = pmap->pm_pdir[pdir_idx]; KASSERT((ptbl != NULL), ("ptbl_hold: null ptbl")); for (i = 0; i < PTBL_PAGES; i++) { pa = pte_vatopa(mmu, kernel_pmap, (vm_offset_t)ptbl + (i * PAGE_SIZE)); m = PHYS_TO_VM_PAGE(pa); m->wire_count++; } } #endif /* Allocate pv_entry structure. */ pv_entry_t pv_alloc(void) { pv_entry_t pv; pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(0); /* XXX powerpc NUMA */ pv = uma_zalloc(pvzone, M_NOWAIT); return (pv); } /* Free pv_entry structure. */ static __inline void pv_free(pv_entry_t pve) { pv_entry_count--; uma_zfree(pvzone, pve); } /* Allocate and initialize pv_entry structure. */ static void pv_insert(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pve; //int su = (pmap == kernel_pmap); //debugf("pv_insert: s (su = %d pmap = 0x%08x va = 0x%08x m = 0x%08x)\n", su, // (u_int32_t)pmap, va, (u_int32_t)m); pve = pv_alloc(); if (pve == NULL) panic("pv_insert: no pv entries!"); pve->pv_pmap = pmap; pve->pv_va = va; /* add to pv_list */ PMAP_LOCK_ASSERT(pmap, MA_OWNED); rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_INSERT_TAIL(&m->md.pv_list, pve, pv_link); //debugf("pv_insert: e\n"); } /* Destroy pv entry. */ static void pv_remove(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pve; //int su = (pmap == kernel_pmap); //debugf("pv_remove: s (su = %d pmap = 0x%08x va = 0x%08x)\n", su, (u_int32_t)pmap, va); PMAP_LOCK_ASSERT(pmap, MA_OWNED); rw_assert(&pvh_global_lock, RA_WLOCKED); /* find pv entry */ TAILQ_FOREACH(pve, &m->md.pv_list, pv_link) { if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) { /* remove from pv_list */ TAILQ_REMOVE(&m->md.pv_list, pve, pv_link); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); /* free pv entry struct */ pv_free(pve); break; } } //debugf("pv_remove: e\n"); } #ifdef __powerpc64__ /* * Clean pte entry, try to free page table page if requested. * * Return 1 if ptbl pages were freed, otherwise return 0. */ static int pte_remove(mmu_t mmu, pmap_t pmap, vm_offset_t va, u_int8_t flags) { vm_page_t m; pte_t *pte; pte = pte_find(mmu, pmap, va); KASSERT(pte != NULL, ("%s: NULL pte", __func__)); if (!PTE_ISVALID(pte)) return (0); /* Get vm_page_t for mapped pte. */ m = PHYS_TO_VM_PAGE(PTE_PA(pte)); if (PTE_ISWIRED(pte)) pmap->pm_stats.wired_count--; /* Handle managed entry. */ if (PTE_ISMANAGED(pte)) { /* Handle modified pages. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); /* Referenced pages. */ if (PTE_ISREFERENCED(pte)) vm_page_aflag_set(m, PGA_REFERENCED); /* Remove pv_entry from pv_list. */ pv_remove(pmap, va, m); } else if (m->md.pv_tracked) { pv_remove(pmap, va, m); if (TAILQ_EMPTY(&m->md.pv_list)) m->md.pv_tracked = false; } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = 0; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); pmap->pm_stats.resident_count--; if (flags & PTBL_UNHOLD) { return (ptbl_unhold(mmu, pmap, va)); } return (0); } /* * Insert PTE for a given page and virtual address. */ static int pte_enter(mmu_t mmu, pmap_t pmap, vm_page_t m, vm_offset_t va, uint32_t flags, boolean_t nosleep) { unsigned int pp2d_idx = PP2D_IDX(va); unsigned int pdir_idx = PDIR_IDX(va); unsigned int ptbl_idx = PTBL_IDX(va); pte_t *ptbl, *pte, pte_tmp; pte_t **pdir; /* Get the page directory pointer. */ pdir = pmap->pm_pp2d[pp2d_idx]; if (pdir == NULL) pdir = pdir_alloc(mmu, pmap, pp2d_idx, nosleep); /* Get the page table pointer. */ ptbl = pdir[pdir_idx]; if (ptbl == NULL) { /* Allocate page table pages. */ ptbl = ptbl_alloc(mmu, pmap, pdir, pdir_idx, nosleep); if (ptbl == NULL) { KASSERT(nosleep, ("nosleep and NULL ptbl")); return (ENOMEM); } pte = &ptbl[ptbl_idx]; } else { /* * Check if there is valid mapping for requested va, if there * is, remove it. */ pte = &ptbl[ptbl_idx]; if (PTE_ISVALID(pte)) { pte_remove(mmu, pmap, va, PTBL_HOLD); } else { /* * pte is not used, increment hold count for ptbl * pages. */ if (pmap != kernel_pmap) ptbl_hold(mmu, pmap, pdir, pdir_idx); } } if (pdir[pdir_idx] == NULL) { if (pmap != kernel_pmap && pmap->pm_pp2d[pp2d_idx] != NULL) pdir_hold(mmu, pmap, pdir); pdir[pdir_idx] = ptbl; } if (pmap->pm_pp2d[pp2d_idx] == NULL) pmap->pm_pp2d[pp2d_idx] = pdir; /* * Insert pv_entry into pv_list for mapped page if part of managed * memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { flags |= PTE_MANAGED; /* Create and insert pv entry. */ pv_insert(pmap, va, m); } pmap->pm_stats.resident_count++; pte_tmp = PTE_RPN_FROM_PA(VM_PAGE_TO_PHYS(m)); pte_tmp |= (PTE_VALID | flags); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = pte_tmp; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); return (0); } /* Return the pa for the given pmap/va. */ static vm_paddr_t pte_vatopa(mmu_t mmu, pmap_t pmap, vm_offset_t va) { vm_paddr_t pa = 0; pte_t *pte; pte = pte_find(mmu, pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) pa = (PTE_PA(pte) | (va & PTE_PA_MASK)); return (pa); } /* allocate pte entries to manage (addr & mask) to (addr & mask) + size */ static void kernel_pte_alloc(vm_offset_t data_end, vm_offset_t addr, vm_offset_t pdir) { int i, j; vm_offset_t va; pte_t *pte; va = addr; /* Initialize kernel pdir */ for (i = 0; i < kernel_pdirs; i++) { kernel_pmap->pm_pp2d[i + PP2D_IDX(va)] = (pte_t **)(pdir + (i * PAGE_SIZE * PDIR_PAGES)); for (j = PDIR_IDX(va + (i * PAGE_SIZE * PDIR_NENTRIES * PTBL_NENTRIES)); j < PDIR_NENTRIES; j++) { kernel_pmap->pm_pp2d[i + PP2D_IDX(va)][j] = (pte_t *)(pdir + (kernel_pdirs * PAGE_SIZE) + (((i * PDIR_NENTRIES) + j) * PAGE_SIZE)); } } /* * Fill in PTEs covering kernel code and data. They are not required * for address translation, as this area is covered by static TLB1 * entries, but for pte_vatopa() to work correctly with kernel area * addresses. */ for (va = addr; va < data_end; va += PAGE_SIZE) { pte = &(kernel_pmap->pm_pp2d[PP2D_IDX(va)][PDIR_IDX(va)][PTBL_IDX(va)]); *pte = PTE_RPN_FROM_PA(kernload + (va - kernstart)); *pte |= PTE_M | PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID | PTE_PS_4KB; } } #else /* * Clean pte entry, try to free page table page if requested. * * Return 1 if ptbl pages were freed, otherwise return 0. */ static int pte_remove(mmu_t mmu, pmap_t pmap, vm_offset_t va, uint8_t flags) { unsigned int pdir_idx = PDIR_IDX(va); unsigned int ptbl_idx = PTBL_IDX(va); vm_page_t m; pte_t *ptbl; pte_t *pte; //int su = (pmap == kernel_pmap); //debugf("pte_remove: s (su = %d pmap = 0x%08x va = 0x%08x flags = %d)\n", // su, (u_int32_t)pmap, va, flags); ptbl = pmap->pm_pdir[pdir_idx]; KASSERT(ptbl, ("pte_remove: null ptbl")); pte = &ptbl[ptbl_idx]; if (pte == NULL || !PTE_ISVALID(pte)) return (0); if (PTE_ISWIRED(pte)) pmap->pm_stats.wired_count--; /* Get vm_page_t for mapped pte. */ m = PHYS_TO_VM_PAGE(PTE_PA(pte)); /* Handle managed entry. */ if (PTE_ISMANAGED(pte)) { if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); if (PTE_ISREFERENCED(pte)) vm_page_aflag_set(m, PGA_REFERENCED); pv_remove(pmap, va, m); } else if (m->md.pv_tracked) { /* * Always pv_insert()/pv_remove() on MPC85XX, in case DPAA is * used. This is needed by the NCSW support code for fast * VA<->PA translation. */ pv_remove(pmap, va, m); if (TAILQ_EMPTY(&m->md.pv_list)) m->md.pv_tracked = false; } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = 0; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); pmap->pm_stats.resident_count--; if (flags & PTBL_UNHOLD) { //debugf("pte_remove: e (unhold)\n"); return (ptbl_unhold(mmu, pmap, pdir_idx)); } //debugf("pte_remove: e\n"); return (0); } /* * Insert PTE for a given page and virtual address. */ static int pte_enter(mmu_t mmu, pmap_t pmap, vm_page_t m, vm_offset_t va, uint32_t flags, boolean_t nosleep) { unsigned int pdir_idx = PDIR_IDX(va); unsigned int ptbl_idx = PTBL_IDX(va); pte_t *ptbl, *pte, pte_tmp; CTR4(KTR_PMAP, "%s: su = %d pmap = %p va = %p", __func__, pmap == kernel_pmap, pmap, va); /* Get the page table pointer. */ ptbl = pmap->pm_pdir[pdir_idx]; if (ptbl == NULL) { /* Allocate page table pages. */ ptbl = ptbl_alloc(mmu, pmap, pdir_idx, nosleep); if (ptbl == NULL) { KASSERT(nosleep, ("nosleep and NULL ptbl")); return (ENOMEM); } pmap->pm_pdir[pdir_idx] = ptbl; pte = &ptbl[ptbl_idx]; } else { /* * Check if there is valid mapping for requested * va, if there is, remove it. */ pte = &pmap->pm_pdir[pdir_idx][ptbl_idx]; if (PTE_ISVALID(pte)) { pte_remove(mmu, pmap, va, PTBL_HOLD); } else { /* * pte is not used, increment hold count * for ptbl pages. */ if (pmap != kernel_pmap) ptbl_hold(mmu, pmap, pdir_idx); } } /* * Insert pv_entry into pv_list for mapped page if part of managed * memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { flags |= PTE_MANAGED; /* Create and insert pv entry. */ pv_insert(pmap, va, m); } pmap->pm_stats.resident_count++; pte_tmp = PTE_RPN_FROM_PA(VM_PAGE_TO_PHYS(m)); pte_tmp |= (PTE_VALID | flags | PTE_PS_4KB); /* 4KB pages only */ mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte = pte_tmp; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); return (0); } /* Return the pa for the given pmap/va. */ static vm_paddr_t pte_vatopa(mmu_t mmu, pmap_t pmap, vm_offset_t va) { vm_paddr_t pa = 0; pte_t *pte; pte = pte_find(mmu, pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) pa = (PTE_PA(pte) | (va & PTE_PA_MASK)); return (pa); } /* Get a pointer to a PTE in a page table. */ static pte_t * pte_find(mmu_t mmu, pmap_t pmap, vm_offset_t va) { unsigned int pdir_idx = PDIR_IDX(va); unsigned int ptbl_idx = PTBL_IDX(va); KASSERT((pmap != NULL), ("pte_find: invalid pmap")); if (pmap->pm_pdir[pdir_idx]) return (&(pmap->pm_pdir[pdir_idx][ptbl_idx])); return (NULL); } /* Set up kernel page tables. */ static void kernel_pte_alloc(vm_offset_t data_end, vm_offset_t addr, vm_offset_t pdir) { int i; vm_offset_t va; pte_t *pte; /* Initialize kernel pdir */ for (i = 0; i < kernel_ptbls; i++) kernel_pmap->pm_pdir[kptbl_min + i] = (pte_t *)(pdir + (i * PAGE_SIZE * PTBL_PAGES)); /* * Fill in PTEs covering kernel code and data. They are not required * for address translation, as this area is covered by static TLB1 * entries, but for pte_vatopa() to work correctly with kernel area * addresses. */ for (va = addr; va < data_end; va += PAGE_SIZE) { pte = &(kernel_pmap->pm_pdir[PDIR_IDX(va)][PTBL_IDX(va)]); *pte = PTE_RPN_FROM_PA(kernload + (va - kernstart)); *pte |= PTE_M | PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID | PTE_PS_4KB; } } #endif /**************************************************************************/ /* PMAP related */ /**************************************************************************/ /* * This is called during booke_init, before the system is really initialized. */ static void mmu_booke_bootstrap(mmu_t mmu, vm_offset_t start, vm_offset_t kernelend) { vm_paddr_t phys_kernelend; struct mem_region *mp, *mp1; int cnt, i, j; vm_paddr_t s, e, sz; vm_paddr_t physsz, hwphyssz; u_int phys_avail_count; vm_size_t kstack0_sz; vm_offset_t kernel_pdir, kstack0; vm_paddr_t kstack0_phys; void *dpcpu; vm_offset_t kernel_ptbl_root; debugf("mmu_booke_bootstrap: entered\n"); /* Set interesting system properties */ #ifdef __powerpc64__ hw_direct_map = 1; #else hw_direct_map = 0; #endif #if defined(COMPAT_FREEBSD32) || !defined(__powerpc64__) elf32_nxstack = 1; #endif /* Initialize invalidation mutex */ mtx_init(&tlbivax_mutex, "tlbivax", NULL, MTX_SPIN); /* Read TLB0 size and associativity. */ tlb0_get_tlbconf(); /* * Align kernel start and end address (kernel image). * Note that kernel end does not necessarily relate to kernsize. * kernsize is the size of the kernel that is actually mapped. */ data_start = round_page(kernelend); data_end = data_start; /* Allocate the dynamic per-cpu area. */ dpcpu = (void *)data_end; data_end += DPCPU_SIZE; /* Allocate space for the message buffer. */ msgbufp = (struct msgbuf *)data_end; data_end += msgbufsize; debugf(" msgbufp at 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", (uintptr_t)msgbufp, data_end); data_end = round_page(data_end); #ifdef __powerpc64__ kernel_ptbl_root = data_end; data_end += PP2D_NENTRIES * sizeof(pte_t**); #else /* Allocate space for ptbl_bufs. */ ptbl_bufs = (struct ptbl_buf *)data_end; data_end += sizeof(struct ptbl_buf) * PTBL_BUFS; debugf(" ptbl_bufs at 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", (uintptr_t)ptbl_bufs, data_end); data_end = round_page(data_end); kernel_ptbl_root = data_end; data_end += PDIR_NENTRIES * sizeof(pte_t*); #endif /* Allocate PTE tables for kernel KVA. */ kernel_pdir = data_end; kernel_ptbls = howmany(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, PDIR_SIZE); #ifdef __powerpc64__ kernel_pdirs = howmany(kernel_ptbls, PDIR_NENTRIES); data_end += kernel_pdirs * PDIR_PAGES * PAGE_SIZE; #endif data_end += kernel_ptbls * PTBL_PAGES * PAGE_SIZE; debugf(" kernel ptbls: %d\n", kernel_ptbls); debugf(" kernel pdir at 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", kernel_pdir, data_end); debugf(" data_end: 0x%"PRI0ptrX"\n", data_end); if (data_end - kernstart > kernsize) { kernsize += tlb1_mapin_region(kernstart + kernsize, kernload + kernsize, (data_end - kernstart) - kernsize); } data_end = kernstart + kernsize; debugf(" updated data_end: 0x%"PRI0ptrX"\n", data_end); /* * Clear the structures - note we can only do it safely after the * possible additional TLB1 translations are in place (above) so that * all range up to the currently calculated 'data_end' is covered. */ dpcpu_init(dpcpu, 0); #ifdef __powerpc64__ memset((void *)kernel_pdir, 0, kernel_pdirs * PDIR_PAGES * PAGE_SIZE + kernel_ptbls * PTBL_PAGES * PAGE_SIZE); #else memset((void *)ptbl_bufs, 0, sizeof(struct ptbl_buf) * PTBL_SIZE); memset((void *)kernel_pdir, 0, kernel_ptbls * PTBL_PAGES * PAGE_SIZE); #endif /*******************************************************/ /* Set the start and end of kva. */ /*******************************************************/ virtual_avail = round_page(data_end); virtual_end = VM_MAX_KERNEL_ADDRESS; #ifndef __powerpc64__ /* Allocate KVA space for page zero/copy operations. */ zero_page_va = virtual_avail; virtual_avail += PAGE_SIZE; copy_page_src_va = virtual_avail; virtual_avail += PAGE_SIZE; copy_page_dst_va = virtual_avail; virtual_avail += PAGE_SIZE; debugf("zero_page_va = 0x%"PRI0ptrX"\n", zero_page_va); debugf("copy_page_src_va = 0x%"PRI0ptrX"\n", copy_page_src_va); debugf("copy_page_dst_va = 0x%"PRI0ptrX"\n", copy_page_dst_va); /* Initialize page zero/copy mutexes. */ mtx_init(&zero_page_mutex, "mmu_booke_zero_page", NULL, MTX_DEF); mtx_init(©_page_mutex, "mmu_booke_copy_page", NULL, MTX_DEF); /* Allocate KVA space for ptbl bufs. */ ptbl_buf_pool_vabase = virtual_avail; virtual_avail += PTBL_BUFS * PTBL_PAGES * PAGE_SIZE; debugf("ptbl_buf_pool_vabase = 0x%"PRI0ptrX" end = 0x%"PRI0ptrX"\n", ptbl_buf_pool_vabase, virtual_avail); #endif /* Calculate corresponding physical addresses for the kernel region. */ phys_kernelend = kernload + kernsize; debugf("kernel image and allocated data:\n"); debugf(" kernload = 0x%09llx\n", (uint64_t)kernload); debugf(" kernstart = 0x%"PRI0ptrX"\n", kernstart); debugf(" kernsize = 0x%"PRI0ptrX"\n", kernsize); /* * Remove kernel physical address range from avail regions list. Page * align all regions. Non-page aligned memory isn't very interesting * to us. Also, sort the entries for ascending addresses. */ /* Retrieve phys/avail mem regions */ mem_regions(&physmem_regions, &physmem_regions_sz, &availmem_regions, &availmem_regions_sz); if (PHYS_AVAIL_ENTRIES < availmem_regions_sz) panic("mmu_booke_bootstrap: phys_avail too small"); sz = 0; cnt = availmem_regions_sz; debugf("processing avail regions:\n"); for (mp = availmem_regions; mp->mr_size; mp++) { s = mp->mr_start; e = mp->mr_start + mp->mr_size; debugf(" %09jx-%09jx -> ", (uintmax_t)s, (uintmax_t)e); /* Check whether this region holds all of the kernel. */ if (s < kernload && e > phys_kernelend) { availmem_regions[cnt].mr_start = phys_kernelend; availmem_regions[cnt++].mr_size = e - phys_kernelend; e = kernload; } /* Look whether this regions starts within the kernel. */ if (s >= kernload && s < phys_kernelend) { if (e <= phys_kernelend) goto empty; s = phys_kernelend; } /* Now look whether this region ends within the kernel. */ if (e > kernload && e <= phys_kernelend) { if (s >= kernload) goto empty; e = kernload; } /* Now page align the start and size of the region. */ s = round_page(s); e = trunc_page(e); if (e < s) e = s; sz = e - s; debugf("%09jx-%09jx = %jx\n", (uintmax_t)s, (uintmax_t)e, (uintmax_t)sz); /* Check whether some memory is left here. */ if (sz == 0) { empty: memmove(mp, mp + 1, (cnt - (mp - availmem_regions)) * sizeof(*mp)); cnt--; mp--; continue; } /* Do an insertion sort. */ for (mp1 = availmem_regions; mp1 < mp; mp1++) if (s < mp1->mr_start) break; if (mp1 < mp) { memmove(mp1 + 1, mp1, (char *)mp - (char *)mp1); mp1->mr_start = s; mp1->mr_size = sz; } else { mp->mr_start = s; mp->mr_size = sz; } } availmem_regions_sz = cnt; /*******************************************************/ /* Steal physical memory for kernel stack from the end */ /* of the first avail region */ /*******************************************************/ kstack0_sz = kstack_pages * PAGE_SIZE; kstack0_phys = availmem_regions[0].mr_start + availmem_regions[0].mr_size; kstack0_phys -= kstack0_sz; availmem_regions[0].mr_size -= kstack0_sz; /*******************************************************/ /* Fill in phys_avail table, based on availmem_regions */ /*******************************************************/ phys_avail_count = 0; physsz = 0; hwphyssz = 0; TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz); debugf("fill in phys_avail:\n"); for (i = 0, j = 0; i < availmem_regions_sz; i++, j += 2) { debugf(" region: 0x%jx - 0x%jx (0x%jx)\n", (uintmax_t)availmem_regions[i].mr_start, (uintmax_t)availmem_regions[i].mr_start + availmem_regions[i].mr_size, (uintmax_t)availmem_regions[i].mr_size); if (hwphyssz != 0 && (physsz + availmem_regions[i].mr_size) >= hwphyssz) { debugf(" hw.physmem adjust\n"); if (physsz < hwphyssz) { phys_avail[j] = availmem_regions[i].mr_start; phys_avail[j + 1] = availmem_regions[i].mr_start + hwphyssz - physsz; physsz = hwphyssz; phys_avail_count++; } break; } phys_avail[j] = availmem_regions[i].mr_start; phys_avail[j + 1] = availmem_regions[i].mr_start + availmem_regions[i].mr_size; phys_avail_count++; physsz += availmem_regions[i].mr_size; } physmem = btoc(physsz); /* Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = powerpc_btop(phys_avail[i + 1]); debugf("Maxmem = 0x%08lx\n", Maxmem); debugf("phys_avail_count = %d\n", phys_avail_count); debugf("physsz = 0x%09jx physmem = %jd (0x%09jx)\n", (uintmax_t)physsz, (uintmax_t)physmem, (uintmax_t)physmem); #ifdef __powerpc64__ /* * Map the physical memory contiguously in TLB1. * Round so it fits into a single mapping. */ tlb1_mapin_region(DMAP_BASE_ADDRESS, 0, phys_avail[i + 1]); #endif /*******************************************************/ /* Initialize (statically allocated) kernel pmap. */ /*******************************************************/ PMAP_LOCK_INIT(kernel_pmap); #ifdef __powerpc64__ kernel_pmap->pm_pp2d = (pte_t ***)kernel_ptbl_root; #else kptbl_min = VM_MIN_KERNEL_ADDRESS / PDIR_SIZE; kernel_pmap->pm_pdir = (pte_t **)kernel_ptbl_root; #endif debugf("kernel_pmap = 0x%"PRI0ptrX"\n", (uintptr_t)kernel_pmap); kernel_pte_alloc(virtual_avail, kernstart, kernel_pdir); for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_tid[i] = TID_KERNEL; /* Initialize each CPU's tidbusy entry 0 with kernel_pmap */ tidbusy[i][TID_KERNEL] = kernel_pmap; } /* Mark kernel_pmap active on all CPUs */ CPU_FILL(&kernel_pmap->pm_active); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /*******************************************************/ /* Final setup */ /*******************************************************/ /* Enter kstack0 into kernel map, provide guard page */ kstack0 = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE; thread0.td_kstack = kstack0; thread0.td_kstack_pages = kstack_pages; debugf("kstack_sz = 0x%08x\n", kstack0_sz); debugf("kstack0_phys at 0x%09llx - 0x%09llx\n", kstack0_phys, kstack0_phys + kstack0_sz); debugf("kstack0 at 0x%"PRI0ptrX" - 0x%"PRI0ptrX"\n", kstack0, kstack0 + kstack0_sz); virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE + kstack0_sz; for (i = 0; i < kstack_pages; i++) { mmu_booke_kenter(mmu, kstack0, kstack0_phys); kstack0 += PAGE_SIZE; kstack0_phys += PAGE_SIZE; } pmap_bootstrapped = 1; debugf("virtual_avail = %"PRI0ptrX"\n", virtual_avail); debugf("virtual_end = %"PRI0ptrX"\n", virtual_end); debugf("mmu_booke_bootstrap: exit\n"); } #ifdef SMP void tlb1_ap_prep(void) { tlb_entry_t *e, tmp; unsigned int i; /* Prepare TLB1 image for AP processors */ e = __boot_tlb1; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&tmp, i); if ((tmp.mas1 & MAS1_VALID) && (tmp.mas2 & _TLB_ENTRY_SHARED)) memcpy(e++, &tmp, sizeof(tmp)); } } void pmap_bootstrap_ap(volatile uint32_t *trcp __unused) { int i; /* * Finish TLB1 configuration: the BSP already set up its TLB1 and we * have the snapshot of its contents in the s/w __boot_tlb1[] table * created by tlb1_ap_prep(), so use these values directly to * (re)program AP's TLB1 hardware. * * Start at index 1 because index 0 has the kernel map. */ for (i = 1; i < TLB1_ENTRIES; i++) { if (__boot_tlb1[i].mas1 & MAS1_VALID) tlb1_write_entry(&__boot_tlb1[i], i); } set_mas4_defaults(); } #endif static void booke_pmap_init_qpages(void) { struct pcpu *pc; int i; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, booke_pmap_init_qpages, NULL); /* * Get the physical page address for the given pmap/virtual address. */ static vm_paddr_t mmu_booke_extract(mmu_t mmu, pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; PMAP_LOCK(pmap); pa = pte_vatopa(mmu, pmap, va); PMAP_UNLOCK(pmap); return (pa); } /* * Extract the physical page address associated with the given * kernel virtual address. */ static vm_paddr_t mmu_booke_kextract(mmu_t mmu, vm_offset_t va) { tlb_entry_t e; vm_paddr_t p = 0; int i; #ifdef __powerpc64__ if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); #endif if (va >= VM_MIN_KERNEL_ADDRESS && va <= VM_MAX_KERNEL_ADDRESS) p = pte_vatopa(mmu, kernel_pmap, va); if (p == 0) { /* Check TLB1 mappings */ for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (va >= e.virt && va < e.virt + e.size) return (e.phys + (va - e.virt)); } } return (p); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ static void mmu_booke_init(mmu_t mmu) { int shpgperproc = PMAP_SHPGPERPROC; /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_reserve_kva(pvzone, pv_entry_max); /* Pre-fill pvzone with initial number of pv entries. */ uma_prealloc(pvzone, PV_ENTRY_ZONE_MIN); /* Create a UMA zone for page table roots. */ ptbl_root_zone = uma_zcreate("pmap root", PMAP_ROOT_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, UMA_ZONE_VM); /* Initialize ptbl allocation. */ ptbl_init(); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ static void mmu_booke_qenter(mmu_t mmu, vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { mmu_booke_kenter(mmu, va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by mmu_booke_qenter. */ static void mmu_booke_qremove(mmu_t mmu, vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { mmu_booke_kremove(mmu, va); va += PAGE_SIZE; } } /* * Map a wired page into kernel virtual address space. */ static void mmu_booke_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa) { mmu_booke_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT); } static void mmu_booke_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { uint32_t flags; pte_t *pte; KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kenter: invalid va")); flags = PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID; flags |= tlb_calc_wimg(pa, ma) << PTE_MAS2_SHIFT; flags |= PTE_PS_4KB; pte = pte_find(mmu, kernel_pmap, va); KASSERT((pte != NULL), ("mmu_booke_kenter: invalid va. NULL PTE")); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); if (PTE_ISVALID(pte)) { CTR1(KTR_PMAP, "%s: replacing entry!", __func__); /* Flush entry from TLB0 */ tlb0_flush_entry(va); } *pte = PTE_RPN_FROM_PA(pa) | flags; //debugf("mmu_booke_kenter: pdir_idx = %d ptbl_idx = %d va=0x%08x " // "pa=0x%08x rpn=0x%08x flags=0x%08x\n", // pdir_idx, ptbl_idx, va, pa, pte->rpn, pte->flags); /* Flush the real memory from the instruction cache. */ if ((flags & (PTE_I | PTE_G)) == 0) __syncicache((void *)va, PAGE_SIZE); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } /* * Remove a page from kernel page table. */ static void mmu_booke_kremove(mmu_t mmu, vm_offset_t va) { pte_t *pte; CTR2(KTR_PMAP,"%s: s (va = 0x%"PRI0ptrX")\n", __func__, va); KASSERT(((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_kremove: invalid va")); pte = pte_find(mmu, kernel_pmap, va); if (!PTE_ISVALID(pte)) { CTR1(KTR_PMAP, "%s: invalid pte", __func__); return; } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Invalidate entry in TLB0, update PTE. */ tlb0_flush_entry(va); *pte = 0; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } /* * Provide a kernel pointer corresponding to a given userland pointer. * The returned pointer is valid until the next time this function is * called in this thread. This is used internally in copyin/copyout. */ int mmu_booke_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen) { if (trunc_page((uintptr_t)uaddr + ulen) > VM_MAXUSER_ADDRESS) return (EFAULT); *kaddr = (void *)(uintptr_t)uaddr; if (klen) *klen = ulen; return (0); } /* * Figure out where a given kernel pointer (usually in a fault) points * to from the VM's perspective, potentially remapping into userland's * address space. */ static int mmu_booke_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user, vm_offset_t *decoded_addr) { if (trunc_page(addr) <= VM_MAXUSER_ADDRESS) *is_user = 1; else *is_user = 0; *decoded_addr = addr; return (0); } /* * Initialize pmap associated with process 0. */ static void mmu_booke_pinit0(mmu_t mmu, pmap_t pmap) { PMAP_LOCK_INIT(pmap); mmu_booke_pinit(mmu, pmap); PCPU_SET(curpmap, pmap); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ static void mmu_booke_pinit(mmu_t mmu, pmap_t pmap) { int i; CTR4(KTR_PMAP, "%s: pmap = %p, proc %d '%s'", __func__, pmap, curthread->td_proc->p_pid, curthread->td_proc->p_comm); KASSERT((pmap != kernel_pmap), ("pmap_pinit: initializing kernel_pmap")); for (i = 0; i < MAXCPU; i++) pmap->pm_tid[i] = TID_NONE; CPU_ZERO(&kernel_pmap->pm_active); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); #ifdef __powerpc64__ pmap->pm_pp2d = uma_zalloc(ptbl_root_zone, M_WAITOK); bzero(pmap->pm_pp2d, sizeof(pte_t **) * PP2D_NENTRIES); #else pmap->pm_pdir = uma_zalloc(ptbl_root_zone, M_WAITOK); bzero(pmap->pm_pdir, sizeof(pte_t *) * PDIR_NENTRIES); TAILQ_INIT(&pmap->pm_ptbl_list); #endif } /* * Release any resources held by the given physical map. * Called when a pmap initialized by mmu_booke_pinit is being released. * Should only be called if the map contains no valid mappings. */ static void mmu_booke_release(mmu_t mmu, pmap_t pmap) { KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); #ifdef __powerpc64__ uma_zfree(ptbl_root_zone, pmap->pm_pp2d); #else uma_zfree(ptbl_root_zone, pmap->pm_pdir); #endif } /* * Insert the given physical page at the specified virtual address in the * target physical map with the protection requested. If specified the page * will be wired down. */ static int mmu_booke_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { int error; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); error = mmu_booke_enter_locked(mmu, pmap, va, m, prot, flags, psind); PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); return (error); } static int mmu_booke_enter_locked(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int pmap_flags, int8_t psind __unused) { pte_t *pte; vm_paddr_t pa; uint32_t flags; int error, su, sync; pa = VM_PAGE_TO_PHYS(m); su = (pmap == kernel_pmap); sync = 0; //debugf("mmu_booke_enter_locked: s (pmap=0x%08x su=%d tid=%d m=0x%08x va=0x%08x " // "pa=0x%08x prot=0x%08x flags=%#x)\n", // (u_int32_t)pmap, su, pmap->pm_tid, // (u_int32_t)m, va, pa, prot, flags); if (su) { KASSERT(((va >= virtual_avail) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_enter_locked: kernel pmap, non kernel va")); } else { KASSERT((va <= VM_MAXUSER_ADDRESS), ("mmu_booke_enter_locked: user pmap, non user va")); } if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * If there is an existing mapping, and the physical address has not * changed, must be protection or wiring change. */ if (((pte = pte_find(mmu, pmap, va)) != NULL) && (PTE_ISVALID(pte)) && (PTE_PA(pte) == pa)) { /* * Before actually updating pte->flags we calculate and * prepare its new value in a helper var. */ flags = *pte; flags &= ~(PTE_UW | PTE_UX | PTE_SW | PTE_SX | PTE_MODIFIED); /* Wiring change, just update stats. */ if ((pmap_flags & PMAP_ENTER_WIRED) != 0) { if (!PTE_ISWIRED(pte)) { flags |= PTE_WIRED; pmap->pm_stats.wired_count++; } } else { if (PTE_ISWIRED(pte)) { flags &= ~PTE_WIRED; pmap->pm_stats.wired_count--; } } if (prot & VM_PROT_WRITE) { /* Add write permissions. */ flags |= PTE_SW; if (!su) flags |= PTE_UW; if ((flags & PTE_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } else { /* Handle modified pages, sense modify status. */ /* * The PTE_MODIFIED flag could be set by underlying * TLB misses since we last read it (above), possibly * other CPUs could update it so we check in the PTE * directly rather than rely on that saved local flags * copy. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); } if (prot & VM_PROT_EXECUTE) { flags |= PTE_SX; if (!su) flags |= PTE_UX; /* * Check existing flags for execute permissions: if we * are turning execute permissions on, icache should * be flushed. */ if ((*pte & (PTE_UX | PTE_SX)) == 0) sync++; } flags &= ~PTE_REFERENCED; /* * The new flags value is all calculated -- only now actually * update the PTE. */ mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(va); *pte &= ~PTE_FLAGS_MASK; *pte |= flags; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } else { /* * If there is an existing mapping, but it's for a different * physical address, pte_enter() will delete the old mapping. */ //if ((pte != NULL) && PTE_ISVALID(pte)) // debugf("mmu_booke_enter_locked: replace\n"); //else // debugf("mmu_booke_enter_locked: new\n"); /* Now set up the flags and install the new mapping. */ flags = (PTE_SR | PTE_VALID); flags |= PTE_M; if (!su) flags |= PTE_UR; if (prot & VM_PROT_WRITE) { flags |= PTE_SW; if (!su) flags |= PTE_UW; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } if (prot & VM_PROT_EXECUTE) { flags |= PTE_SX; if (!su) flags |= PTE_UX; } /* If its wired update stats. */ if ((pmap_flags & PMAP_ENTER_WIRED) != 0) flags |= PTE_WIRED; error = pte_enter(mmu, pmap, m, va, flags, (pmap_flags & PMAP_ENTER_NOSLEEP) != 0); if (error != 0) return (KERN_RESOURCE_SHORTAGE); if ((flags & PMAP_ENTER_WIRED) != 0) pmap->pm_stats.wired_count++; /* Flush the real memory from the instruction cache. */ if (prot & VM_PROT_EXECUTE) sync++; } if (sync && (su || pmap == PCPU_GET(curpmap))) { __syncicache((void *)va, PAGE_SIZE); sync = 0; } return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ static void mmu_booke_enter_object(mmu_t mmu, pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mmu_booke_enter_locked(mmu, pmap, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static void mmu_booke_enter_quick(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); mmu_booke_enter_locked(mmu, pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly rounded to the page size. */ static void mmu_booke_remove(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_offset_t endva) { pte_t *pte; uint8_t hold_flag; int su = (pmap == kernel_pmap); //debugf("mmu_booke_remove: s (su = %d pmap=0x%08x tid=%d va=0x%08x endva=0x%08x)\n", // su, (u_int32_t)pmap, pmap->pm_tid, va, endva); if (su) { KASSERT(((va >= virtual_avail) && (va <= VM_MAX_KERNEL_ADDRESS)), ("mmu_booke_remove: kernel pmap, non kernel va")); } else { KASSERT((va <= VM_MAXUSER_ADDRESS), ("mmu_booke_remove: user pmap, non user va")); } if (PMAP_REMOVE_DONE(pmap)) { //debugf("mmu_booke_remove: e (empty)\n"); return; } hold_flag = PTBL_HOLD_FLAG(pmap); //debugf("mmu_booke_remove: hold_flag = %d\n", hold_flag); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); for (; va < endva; va += PAGE_SIZE) { pte = pte_find(mmu, pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) pte_remove(mmu, pmap, va, hold_flag); } PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); //debugf("mmu_booke_remove: e\n"); } /* * Remove physical page from all pmaps in which it resides. */ static void mmu_booke_remove_all(mmu_t mmu, vm_page_t m) { pv_entry_t pv, pvn; uint8_t hold_flag; rw_wlock(&pvh_global_lock); for (pv = TAILQ_FIRST(&m->md.pv_list); pv != NULL; pv = pvn) { pvn = TAILQ_NEXT(pv, pv_link); PMAP_LOCK(pv->pv_pmap); hold_flag = PTBL_HOLD_FLAG(pv->pv_pmap); pte_remove(mmu, pv->pv_pmap, pv->pv_va, hold_flag); PMAP_UNLOCK(pv->pv_pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } /* * Map a range of physical addresses into kernel virtual address space. */ static vm_offset_t mmu_booke_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start, vm_paddr_t pa_end, int prot) { vm_offset_t sva = *virt; vm_offset_t va = sva; #ifdef __powerpc64__ /* XXX: Handle memory not starting at 0x0. */ if (pa_end < ctob(Maxmem)) return (PHYS_TO_DMAP(pa_start)); #endif while (pa_start < pa_end) { mmu_booke_kenter(mmu, va, pa_start); va += PAGE_SIZE; pa_start += PAGE_SIZE; } *virt = va; return (sva); } /* * The pmap must be activated before it's address space can be accessed in any * way. */ static void mmu_booke_activate(mmu_t mmu, struct thread *td) { pmap_t pmap; u_int cpuid; pmap = &td->td_proc->p_vmspace->vm_pmap; CTR5(KTR_PMAP, "%s: s (td = %p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX")", __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap); KASSERT((pmap != kernel_pmap), ("mmu_booke_activate: kernel_pmap!")); sched_pin(); cpuid = PCPU_GET(cpuid); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); PCPU_SET(curpmap, pmap); if (pmap->pm_tid[cpuid] == TID_NONE) tid_alloc(pmap); /* Load PID0 register with pmap tid value. */ mtspr(SPR_PID0, pmap->pm_tid[cpuid]); __asm __volatile("isync"); mtspr(SPR_DBCR0, td->td_pcb->pcb_cpu.booke.dbcr0); sched_unpin(); CTR3(KTR_PMAP, "%s: e (tid = %d for '%s')", __func__, pmap->pm_tid[PCPU_GET(cpuid)], td->td_proc->p_comm); } /* * Deactivate the specified process's address space. */ static void mmu_booke_deactivate(mmu_t mmu, struct thread *td) { pmap_t pmap; pmap = &td->td_proc->p_vmspace->vm_pmap; CTR5(KTR_PMAP, "%s: td=%p, proc = '%s', id = %d, pmap = 0x%"PRI0ptrX, __func__, td, td->td_proc->p_comm, td->td_proc->p_pid, pmap); td->td_pcb->pcb_cpu.booke.dbcr0 = mfspr(SPR_DBCR0); CPU_CLR_ATOMIC(PCPU_GET(cpuid), &pmap->pm_active); PCPU_SET(curpmap, NULL); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ static void mmu_booke_copy(mmu_t mmu, pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * Set the physical protection on the specified range of this map as requested. */ static void mmu_booke_protect(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va; vm_page_t m; pte_t *pte; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { mmu_booke_remove(mmu, pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (va = sva; va < eva; va += PAGE_SIZE) { if ((pte = pte_find(mmu, pmap, va)) != NULL) { if (PTE_ISVALID(pte)) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Handle modified pages. */ if (PTE_ISMODIFIED(pte) && PTE_ISMANAGED(pte)) vm_page_dirty(m); tlb0_flush_entry(va); *pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } } } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ static void mmu_booke_remove_write(mmu_t mmu, vm_page_t m) { pv_entry_t pv; pte_t *pte; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL) { if (PTE_ISVALID(pte)) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); /* Handle modified pages. */ if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); /* Flush mapping from TLB0. */ *pte &= ~(PTE_UW | PTE_SW | PTE_MODIFIED); tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } } PMAP_UNLOCK(pv->pv_pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); } static void mmu_booke_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz) { pte_t *pte; vm_paddr_t pa = 0; int sync_sz, valid; #ifndef __powerpc64__ pmap_t pmap; vm_page_t m; vm_offset_t addr; int active; #endif #ifndef __powerpc64__ rw_wlock(&pvh_global_lock); pmap = PCPU_GET(curpmap); active = (pm == kernel_pmap || pm == pmap) ? 1 : 0; #endif while (sz > 0) { PMAP_LOCK(pm); pte = pte_find(mmu, pm, va); valid = (pte != NULL && PTE_ISVALID(pte)) ? 1 : 0; if (valid) pa = PTE_PA(pte); PMAP_UNLOCK(pm); sync_sz = PAGE_SIZE - (va & PAGE_MASK); sync_sz = min(sync_sz, sz); if (valid) { #ifdef __powerpc64__ pa += (va & PAGE_MASK); __syncicache((void *)PHYS_TO_DMAP(pa), sync_sz); #else if (!active) { /* Create a mapping in the active pmap. */ addr = 0; m = PHYS_TO_VM_PAGE(pa); PMAP_LOCK(pmap); pte_enter(mmu, pmap, m, addr, PTE_SR | PTE_VALID, FALSE); addr += (va & PAGE_MASK); __syncicache((void *)addr, sync_sz); pte_remove(mmu, pmap, addr, PTBL_UNHOLD); PMAP_UNLOCK(pmap); } else __syncicache((void *)va, sync_sz); #endif } va += sync_sz; sz -= sync_sz; } #ifndef __powerpc64__ rw_wunlock(&pvh_global_lock); #endif } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ static vm_page_t mmu_booke_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pte_t *pte; vm_page_t m; uint32_t pte_wbit; m = NULL; PMAP_LOCK(pmap); pte = pte_find(mmu, pmap, va); if ((pte != NULL) && PTE_ISVALID(pte)) { if (pmap == kernel_pmap) pte_wbit = PTE_SW; else pte_wbit = PTE_UW; if ((*pte & pte_wbit) != 0 || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE(PTE_PA(pte)); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } /* * Initialize a vm_page's machine-dependent fields. */ static void mmu_booke_page_init(mmu_t mmu, vm_page_t m) { m->md.pv_tracked = 0; TAILQ_INIT(&m->md.pv_list); } /* * mmu_booke_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. */ static void mmu_booke_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size) { vm_offset_t va; /* XXX KASSERT off and size are within a single page? */ #ifdef __powerpc64__ va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); bzero((caddr_t)va + off, size); #else mtx_lock(&zero_page_mutex); va = zero_page_va; mmu_booke_kenter(mmu, va, VM_PAGE_TO_PHYS(m)); bzero((caddr_t)va + off, size); mmu_booke_kremove(mmu, va); mtx_unlock(&zero_page_mutex); #endif } /* * mmu_booke_zero_page zeros the specified hardware page. */ static void mmu_booke_zero_page(mmu_t mmu, vm_page_t m) { vm_offset_t off, va; #ifdef __powerpc64__ va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); #else va = zero_page_va; mtx_lock(&zero_page_mutex); mmu_booke_kenter(mmu, va, VM_PAGE_TO_PHYS(m)); for (off = 0; off < PAGE_SIZE; off += cacheline_size) __asm __volatile("dcbz 0,%0" :: "r"(va + off)); mmu_booke_kremove(mmu, va); mtx_unlock(&zero_page_mutex); #endif } /* * mmu_booke_copy_page copies the specified (machine independent) page by * mapping the page into virtual memory and using memcopy to copy the page, * one machine dependent page at a time. */ static void mmu_booke_copy_page(mmu_t mmu, vm_page_t sm, vm_page_t dm) { vm_offset_t sva, dva; #ifdef __powerpc64__ sva = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(sm)); dva = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dm)); memcpy((caddr_t)dva, (caddr_t)sva, PAGE_SIZE); #else sva = copy_page_src_va; dva = copy_page_dst_va; mtx_lock(©_page_mutex); mmu_booke_kenter(mmu, sva, VM_PAGE_TO_PHYS(sm)); mmu_booke_kenter(mmu, dva, VM_PAGE_TO_PHYS(dm)); memcpy((caddr_t)dva, (caddr_t)sva, PAGE_SIZE); mmu_booke_kremove(mmu, dva); mmu_booke_kremove(mmu, sva); mtx_unlock(©_page_mutex); #endif } static inline void mmu_booke_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset, vm_page_t *mb, vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; #ifdef __powerpc64__ vm_page_t pa, pb; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pa = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pb = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); a_cp = (caddr_t)((uintptr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pa)) + a_pg_offset); b_cp = (caddr_t)((uintptr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pb)) + b_pg_offset); bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } #else mtx_lock(©_page_mutex); while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); mmu_booke_kenter(mmu, copy_page_src_va, VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])); a_cp = (char *)copy_page_src_va + a_pg_offset; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); mmu_booke_kenter(mmu, copy_page_dst_va, VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])); b_cp = (char *)copy_page_dst_va + b_pg_offset; bcopy(a_cp, b_cp, cnt); mmu_booke_kremove(mmu, copy_page_dst_va); mmu_booke_kremove(mmu, copy_page_src_va); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } mtx_unlock(©_page_mutex); #endif } static vm_offset_t mmu_booke_quick_enter_page(mmu_t mmu, vm_page_t m) { #ifdef __powerpc64__ return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); #else vm_paddr_t paddr; vm_offset_t qaddr; uint32_t flags; pte_t *pte; paddr = VM_PAGE_TO_PHYS(m); flags = PTE_SR | PTE_SW | PTE_SX | PTE_WIRED | PTE_VALID; flags |= tlb_calc_wimg(paddr, pmap_page_get_memattr(m)) << PTE_MAS2_SHIFT; flags |= PTE_PS_4KB; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = pte_find(mmu, kernel_pmap, qaddr); KASSERT(*pte == 0, ("mmu_booke_quick_enter_page: PTE busy")); /* * XXX: tlbivax is broadcast to other cores, but qaddr should * not be present in other TLBs. Is there a better instruction * sequence to use? Or just forget it & use mmu_booke_kenter()... */ __asm __volatile("tlbivax 0, %0" :: "r"(qaddr & MAS2_EPN_MASK)); __asm __volatile("isync; msync"); *pte = PTE_RPN_FROM_PA(paddr) | flags; /* Flush the real memory from the instruction cache. */ if ((flags & (PTE_I | PTE_G)) == 0) __syncicache((void *)qaddr, PAGE_SIZE); return (qaddr); #endif } static void mmu_booke_quick_remove_page(mmu_t mmu, vm_offset_t addr) { #ifndef __powerpc64__ pte_t *pte; pte = pte_find(mmu, kernel_pmap, addr); KASSERT(PCPU_GET(qmap_addr) == addr, ("mmu_booke_quick_remove_page: invalid address")); KASSERT(*pte != 0, ("mmu_booke_quick_remove_page: PTE not in use")); *pte = 0; critical_exit(); #endif } /* * Return whether or not the specified physical page was modified * in any of physical maps. */ static boolean_t mmu_booke_is_modified(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_is_modified: page %p is not managed", m)); rv = FALSE; /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can be modified. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (rv); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISMODIFIED(pte)) rv = TRUE; } PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Return whether or not the specified virtual address is eligible * for prefault. */ static boolean_t mmu_booke_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t addr) { return (FALSE); } /* * Return whether or not the specified physical page was referenced * in any physical maps. */ static boolean_t mmu_booke_is_referenced(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_is_referenced: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISREFERENCED(pte)) rv = TRUE; } PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Clear the modify bits on the specified physical page. */ static void mmu_booke_clear_modify(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("mmu_booke_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PG_AWRITEABLE, then no PTEs can be modified. * If the object containing the page is locked and the page is not * exclusive busied, then PG_AWRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); if (*pte & (PTE_SW | PTE_UW | PTE_MODIFIED)) { tlb0_flush_entry(pv->pv_va); *pte &= ~(PTE_SW | PTE_UW | PTE_MODIFIED | PTE_REFERENCED); } tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); } PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); } /* * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ static int mmu_booke_ts_referenced(mmu_t mmu, vm_page_t m) { pte_t *pte; pv_entry_t pv; int count; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_ts_referenced: page %p is not managed", m)); count = 0; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL && PTE_ISVALID(pte)) { if (PTE_ISMODIFIED(pte)) vm_page_dirty(m); if (PTE_ISREFERENCED(pte)) { mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); tlb0_flush_entry(pv->pv_va); *pte &= ~PTE_REFERENCED; tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); if (++count >= PMAP_TS_REFERENCED_MAX) { PMAP_UNLOCK(pv->pv_pmap); break; } } } PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); return (count); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range must * have the wired attribute set. In contrast, invalid mappings cannot have * the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, so * there is no need to invalidate any TLB entries. */ static void mmu_booke_unwire(mmu_t mmu, pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va; pte_t *pte; PMAP_LOCK(pmap); for (va = sva; va < eva; va += PAGE_SIZE) { if ((pte = pte_find(mmu, pmap, va)) != NULL && PTE_ISVALID(pte)) { if (!PTE_ISWIRED(pte)) panic("mmu_booke_unwire: pte %p isn't wired", pte); *pte &= ~PTE_WIRED; pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Return true if the pmap's pv is one of the first 16 pvs linked to from this * page. This count may be changed upwards or downwards in the future; it is * only necessary that true be returned for a small subset of pmaps for proper * page aging. */ static boolean_t mmu_booke_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("mmu_booke_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { if (pv->pv_pmap == pmap) { rv = TRUE; break; } if (++loops >= 16) break; } rw_wunlock(&pvh_global_lock); return (rv); } /* * Return the number of managed mappings to the given physical page that are * wired. */ static int mmu_booke_page_wired_mappings(mmu_t mmu, vm_page_t m) { pv_entry_t pv; pte_t *pte; int count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) { PMAP_LOCK(pv->pv_pmap); if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL) if (PTE_ISVALID(pte) && PTE_ISWIRED(pte)) count++; PMAP_UNLOCK(pv->pv_pmap); } rw_wunlock(&pvh_global_lock); return (count); } static int mmu_booke_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { int i; vm_offset_t va; /* * This currently does not work for entries that * overlap TLB1 entries. */ for (i = 0; i < TLB1_ENTRIES; i ++) { if (tlb1_iomapped(i, pa, size, &va) == 0) return (0); } return (EFAULT); } void mmu_booke_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va) { vm_paddr_t ppa; vm_offset_t ofs; vm_size_t gran; /* Minidumps are based on virtual memory addresses. */ if (do_minidump) { *va = (void *)(vm_offset_t)pa; return; } /* Raw physical memory dumps don't have a virtual address. */ /* We always map a 256MB page at 256M. */ gran = 256 * 1024 * 1024; ppa = rounddown2(pa, gran); ofs = pa - ppa; *va = (void *)gran; tlb1_set_entry((vm_offset_t)va, ppa, gran, _TLB_ENTRY_IO); if (sz > (gran - ofs)) tlb1_set_entry((vm_offset_t)(va + gran), ppa + gran, gran, _TLB_ENTRY_IO); } void mmu_booke_dumpsys_unmap(mmu_t mmu, vm_paddr_t pa, size_t sz, void *va) { vm_paddr_t ppa; vm_offset_t ofs; vm_size_t gran; tlb_entry_t e; int i; /* Minidumps are based on virtual memory addresses. */ /* Nothing to do... */ if (do_minidump) return; for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) break; } /* Raw physical memory dumps don't have a virtual address. */ i--; e.mas1 = 0; e.mas2 = 0; e.mas3 = 0; tlb1_write_entry(&e, i); gran = 256 * 1024 * 1024; ppa = rounddown2(pa, gran); ofs = pa - ppa; if (sz > (gran - ofs)) { i--; e.mas1 = 0; e.mas2 = 0; e.mas3 = 0; tlb1_write_entry(&e, i); } } extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1]; void mmu_booke_scan_init(mmu_t mmu) { vm_offset_t va; pte_t *pte; int i; if (!do_minidump) { /* Initialize phys. segments for dumpsys(). */ memset(&dump_map, 0, sizeof(dump_map)); mem_regions(&physmem_regions, &physmem_regions_sz, &availmem_regions, &availmem_regions_sz); for (i = 0; i < physmem_regions_sz; i++) { dump_map[i].pa_start = physmem_regions[i].mr_start; dump_map[i].pa_size = physmem_regions[i].mr_size; } return; } /* Virtual segments for minidumps: */ memset(&dump_map, 0, sizeof(dump_map)); /* 1st: kernel .data and .bss. */ dump_map[0].pa_start = trunc_page((uintptr_t)_etext); dump_map[0].pa_size = round_page((uintptr_t)_end) - dump_map[0].pa_start; /* 2nd: msgbuf and tables (see pmap_bootstrap()). */ dump_map[1].pa_start = data_start; dump_map[1].pa_size = data_end - data_start; /* 3rd: kernel VM. */ va = dump_map[1].pa_start + dump_map[1].pa_size; /* Find start of next chunk (from va). */ while (va < virtual_end) { /* Don't dump the buffer cache. */ if (va >= kmi.buffer_sva && va < kmi.buffer_eva) { va = kmi.buffer_eva; continue; } pte = pte_find(mmu, kernel_pmap, va); if (pte != NULL && PTE_ISVALID(pte)) break; va += PAGE_SIZE; } if (va < virtual_end) { dump_map[2].pa_start = va; va += PAGE_SIZE; /* Find last page in chunk. */ while (va < virtual_end) { /* Don't run into the buffer cache. */ if (va == kmi.buffer_sva) break; pte = pte_find(mmu, kernel_pmap, va); if (pte == NULL || !PTE_ISVALID(pte)) break; va += PAGE_SIZE; } dump_map[2].pa_size = va - dump_map[2].pa_start; } } /* * Map a set of physical memory pages into the kernel virtual address space. * Return a pointer to where it is mapped. This routine is intended to be used * for mapping device memory, NOT real memory. */ static void * mmu_booke_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size) { return (mmu_booke_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT)); } static void * mmu_booke_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { tlb_entry_t e; void *res; uintptr_t va, tmpva; vm_size_t sz; int i; /* * Check if this is premapped in TLB1. Note: this should probably also * check whether a sequence of TLB1 entries exist that match the * requirement, but now only checks the easy case. */ for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (pa >= e.phys && (pa + size) <= (e.phys + e.size) && (ma == VM_MEMATTR_DEFAULT || tlb_calc_wimg(pa, ma) == (e.mas2 & (MAS2_WIMGE_MASK & ~_TLB_ENTRY_SHARED)))) return (void *)(e.virt + (vm_offset_t)(pa - e.phys)); } size = roundup(size, PAGE_SIZE); /* * The device mapping area is between VM_MAXUSER_ADDRESS and * VM_MIN_KERNEL_ADDRESS. This gives 1GB of device addressing. */ #ifdef SPARSE_MAPDEV /* * With a sparse mapdev, align to the largest starting region. This * could feasibly be optimized for a 'best-fit' alignment, but that * calculation could be very costly. * Align to the smaller of: * - first set bit in overlap of (pa & size mask) * - largest size envelope * * It's possible the device mapping may start at a PA that's not larger * than the size mask, so we need to offset in to maximize the TLB entry * range and minimize the number of used TLB entries. */ do { tmpva = tlb1_map_base; sz = ffsl(((1 << flsl(size-1)) - 1) & pa); sz = sz ? min(roundup(sz + 3, 4), flsl(size) - 1) : flsl(size) - 1; va = roundup(tlb1_map_base, 1 << sz) | (((1 << sz) - 1) & pa); #ifdef __powerpc64__ } while (!atomic_cmpset_long(&tlb1_map_base, tmpva, va + size)); #else } while (!atomic_cmpset_int(&tlb1_map_base, tmpva, va + size)); #endif #else #ifdef __powerpc64__ va = atomic_fetchadd_long(&tlb1_map_base, size); #else va = atomic_fetchadd_int(&tlb1_map_base, size); #endif #endif res = (void *)va; do { sz = 1 << (ilog2(size) & ~1); /* Align size to PA */ if (pa % sz != 0) { do { sz >>= 2; } while (pa % sz != 0); } /* Now align from there to VA */ if (va % sz != 0) { do { sz >>= 2; } while (va % sz != 0); } if (bootverbose) printf("Wiring VA=%lx to PA=%jx (size=%lx)\n", va, (uintmax_t)pa, sz); if (tlb1_set_entry(va, pa, sz, _TLB_ENTRY_SHARED | tlb_calc_wimg(pa, ma)) < 0) return (NULL); size -= sz; pa += sz; va += sz; } while (size > 0); return (res); } /* * 'Unmap' a range mapped by mmu_booke_mapdev(). */ static void mmu_booke_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size) { #ifdef SUPPORTS_SHRINKING_TLB1 vm_offset_t base, offset; /* * Unmap only if this is inside kernel virtual space. */ if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= VM_MAX_KERNEL_ADDRESS)) { base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); kva_free(base, size); } #endif } /* * mmu_booke_object_init_pt preloads the ptes for a given object into the * specified pmap. This eliminates the blast of soft faults on process startup * and immediately after an mmap. */ static void mmu_booke_object_init_pt(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("mmu_booke_object_init_pt: non-device object")); } /* * Perform the pmap work for mincore. */ static int mmu_booke_mincore(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { /* XXX: this should be implemented at some point */ return (0); } static int mmu_booke_change_attr(mmu_t mmu, vm_offset_t addr, vm_size_t sz, vm_memattr_t mode) { vm_offset_t va; pte_t *pte; int i, j; tlb_entry_t e; /* Check TLB1 mappings */ for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (addr >= e.virt && addr < e.virt + e.size) break; } if (i < TLB1_ENTRIES) { /* Only allow full mappings to be modified for now. */ /* Validate the range. */ for (j = i, va = addr; va < addr + sz; va += e.size, j++) { tlb1_read_entry(&e, j); if (va != e.virt || (sz - (va - addr) < e.size)) return (EINVAL); } for (va = addr; va < addr + sz; va += e.size, i++) { tlb1_read_entry(&e, i); e.mas2 &= ~MAS2_WIMGE_MASK; e.mas2 |= tlb_calc_wimg(e.phys, mode); /* * Write it out to the TLB. Should really re-sync with other * cores. */ tlb1_write_entry(&e, i); } return (0); } /* Not in TLB1, try through pmap */ /* First validate the range. */ for (va = addr; va < addr + sz; va += PAGE_SIZE) { pte = pte_find(mmu, kernel_pmap, va); if (pte == NULL || !PTE_ISVALID(pte)) return (EINVAL); } mtx_lock_spin(&tlbivax_mutex); tlb_miss_lock(); for (va = addr; va < addr + sz; va += PAGE_SIZE) { pte = pte_find(mmu, kernel_pmap, va); *pte &= ~(PTE_MAS2_MASK << PTE_MAS2_SHIFT); *pte |= tlb_calc_wimg(PTE_PA(pte), mode) << PTE_MAS2_SHIFT; tlb0_flush_entry(va); } tlb_miss_unlock(); mtx_unlock_spin(&tlbivax_mutex); return (0); } /**************************************************************************/ /* TID handling */ /**************************************************************************/ /* * Allocate a TID. If necessary, steal one from someone else. * The new TID is flushed from the TLB before returning. */ static tlbtid_t tid_alloc(pmap_t pmap) { tlbtid_t tid; int thiscpu; KASSERT((pmap != kernel_pmap), ("tid_alloc: kernel pmap")); CTR2(KTR_PMAP, "%s: s (pmap = %p)", __func__, pmap); thiscpu = PCPU_GET(cpuid); tid = PCPU_GET(booke.tid_next); if (tid > TID_MAX) tid = TID_MIN; PCPU_SET(booke.tid_next, tid + 1); /* If we are stealing TID then clear the relevant pmap's field */ if (tidbusy[thiscpu][tid] != NULL) { CTR2(KTR_PMAP, "%s: warning: stealing tid %d", __func__, tid); tidbusy[thiscpu][tid]->pm_tid[thiscpu] = TID_NONE; /* Flush all entries from TLB0 matching this TID. */ tid_flush(tid); } tidbusy[thiscpu][tid] = pmap; pmap->pm_tid[thiscpu] = tid; __asm __volatile("msync; isync"); CTR3(KTR_PMAP, "%s: e (%02d next = %02d)", __func__, tid, PCPU_GET(booke.tid_next)); return (tid); } /**************************************************************************/ /* TLB0 handling */ /**************************************************************************/ /* Convert TLB0 va and way number to tlb0[] table index. */ static inline unsigned int tlb0_tableidx(vm_offset_t va, unsigned int way) { unsigned int idx; idx = (way * TLB0_ENTRIES_PER_WAY); idx += (va & MAS2_TLB0_ENTRY_IDX_MASK) >> MAS2_TLB0_ENTRY_IDX_SHIFT; return (idx); } /* * Invalidate TLB0 entry. */ static inline void tlb0_flush_entry(vm_offset_t va) { CTR2(KTR_PMAP, "%s: s va=0x%08x", __func__, va); mtx_assert(&tlbivax_mutex, MA_OWNED); __asm __volatile("tlbivax 0, %0" :: "r"(va & MAS2_EPN_MASK)); __asm __volatile("isync; msync"); __asm __volatile("tlbsync; msync"); CTR1(KTR_PMAP, "%s: e", __func__); } /**************************************************************************/ /* TLB1 handling */ /**************************************************************************/ /* * TLB1 mapping notes: * * TLB1[0] Kernel text and data. * TLB1[1-15] Additional kernel text and data mappings (if required), PCI * windows, other devices mappings. */ /* * Read an entry from given TLB1 slot. */ void tlb1_read_entry(tlb_entry_t *entry, unsigned int slot) { register_t msr; uint32_t mas0; KASSERT((entry != NULL), ("%s(): Entry is NULL!", __func__)); msr = mfmsr(); __asm __volatile("wrteei 0"); mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(slot); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); entry->mas1 = mfspr(SPR_MAS1); entry->mas2 = mfspr(SPR_MAS2); entry->mas3 = mfspr(SPR_MAS3); switch ((mfpvr() >> 16) & 0xFFFF) { case FSL_E500v2: case FSL_E500mc: case FSL_E5500: case FSL_E6500: entry->mas7 = mfspr(SPR_MAS7); break; default: entry->mas7 = 0; break; } __asm __volatile("wrtee %0" :: "r"(msr)); entry->virt = entry->mas2 & MAS2_EPN_MASK; entry->phys = ((vm_paddr_t)(entry->mas7 & MAS7_RPN) << 32) | (entry->mas3 & MAS3_RPN); entry->size = tsize2size((entry->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT); } struct tlbwrite_args { tlb_entry_t *e; unsigned int idx; }; static void tlb1_write_entry_int(void *arg) { struct tlbwrite_args *args = arg; uint32_t mas0; /* Select entry */ mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(args->idx); mtspr(SPR_MAS0, mas0); mtspr(SPR_MAS1, args->e->mas1); mtspr(SPR_MAS2, args->e->mas2); mtspr(SPR_MAS3, args->e->mas3); switch ((mfpvr() >> 16) & 0xFFFF) { case FSL_E500mc: case FSL_E5500: case FSL_E6500: mtspr(SPR_MAS8, 0); /* FALLTHROUGH */ case FSL_E500v2: mtspr(SPR_MAS7, args->e->mas7); break; default: break; } __asm __volatile("isync; tlbwe; isync; msync"); } static void tlb1_write_entry_sync(void *arg) { /* Empty synchronization point for smp_rendezvous(). */ } /* * Write given entry to TLB1 hardware. */ static void tlb1_write_entry(tlb_entry_t *e, unsigned int idx) { struct tlbwrite_args args; args.e = e; args.idx = idx; #ifdef SMP if ((e->mas2 & _TLB_ENTRY_SHARED) && smp_started) { mb(); smp_rendezvous(tlb1_write_entry_sync, tlb1_write_entry_int, tlb1_write_entry_sync, &args); } else #endif { register_t msr; msr = mfmsr(); __asm __volatile("wrteei 0"); tlb1_write_entry_int(&args); __asm __volatile("wrtee %0" :: "r"(msr)); } } /* * Return the largest uint value log such that 2^log <= num. */ static unsigned int ilog2(unsigned long num) { long lz; #ifdef __powerpc64__ __asm ("cntlzd %0, %1" : "=r" (lz) : "r" (num)); return (63 - lz); #else __asm ("cntlzw %0, %1" : "=r" (lz) : "r" (num)); return (31 - lz); #endif } /* * Convert TLB TSIZE value to mapped region size. */ static vm_size_t tsize2size(unsigned int tsize) { /* * size = 4^tsize KB * size = 4^tsize * 2^10 = 2^(2 * tsize - 10) */ return ((1 << (2 * tsize)) * 1024); } /* * Convert region size (must be power of 4) to TLB TSIZE value. */ static unsigned int size2tsize(vm_size_t size) { return (ilog2(size) / 2 - 5); } /* * Register permanent kernel mapping in TLB1. * * Entries are created starting from index 0 (current free entry is * kept in tlb1_idx) and are not supposed to be invalidated. */ int tlb1_set_entry(vm_offset_t va, vm_paddr_t pa, vm_size_t size, uint32_t flags) { tlb_entry_t e; uint32_t ts, tid; int tsize, index; for (index = 0; index < TLB1_ENTRIES; index++) { tlb1_read_entry(&e, index); if ((e.mas1 & MAS1_VALID) == 0) break; /* Check if we're just updating the flags, and update them. */ if (e.phys == pa && e.virt == va && e.size == size) { e.mas2 = (va & MAS2_EPN_MASK) | flags; tlb1_write_entry(&e, index); return (0); } } if (index >= TLB1_ENTRIES) { printf("tlb1_set_entry: TLB1 full!\n"); return (-1); } /* Convert size to TSIZE */ tsize = size2tsize(size); tid = (TID_KERNEL << MAS1_TID_SHIFT) & MAS1_TID_MASK; /* XXX TS is hard coded to 0 for now as we only use single address space */ ts = (0 << MAS1_TS_SHIFT) & MAS1_TS_MASK; e.phys = pa; e.virt = va; e.size = size; e.mas1 = MAS1_VALID | MAS1_IPROT | ts | tid; e.mas1 |= ((tsize << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK); e.mas2 = (va & MAS2_EPN_MASK) | flags; /* Set supervisor RWX permission bits */ e.mas3 = (pa & MAS3_RPN) | MAS3_SR | MAS3_SW | MAS3_SX; e.mas7 = (pa >> 32) & MAS7_RPN; tlb1_write_entry(&e, index); /* * XXX in general TLB1 updates should be propagated between CPUs, * since current design assumes to have the same TLB1 set-up on all * cores. */ return (0); } /* * Map in contiguous RAM region into the TLB1 using maximum of * KERNEL_REGION_MAX_TLB_ENTRIES entries. * * If necessary round up last entry size and return total size * used by all allocated entries. */ vm_size_t tlb1_mapin_region(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { vm_size_t pgs[KERNEL_REGION_MAX_TLB_ENTRIES]; vm_size_t mapped, pgsz, base, mask; int idx, nents; /* Round up to the next 1M */ size = roundup2(size, 1 << 20); mapped = 0; idx = 0; base = va; pgsz = 64*1024*1024; while (mapped < size) { while (mapped < size && idx < KERNEL_REGION_MAX_TLB_ENTRIES) { while (pgsz > (size - mapped)) pgsz >>= 2; pgs[idx++] = pgsz; mapped += pgsz; } /* We under-map. Correct for this. */ if (mapped < size) { while (pgs[idx - 1] == pgsz) { idx--; mapped -= pgsz; } /* XXX We may increase beyond out starting point. */ pgsz <<= 2; pgs[idx++] = pgsz; mapped += pgsz; } } nents = idx; mask = pgs[0] - 1; /* Align address to the boundary */ if (va & mask) { va = (va + mask) & ~mask; pa = (pa + mask) & ~mask; } for (idx = 0; idx < nents; idx++) { pgsz = pgs[idx]; debugf("%u: %llx -> %jx, size=%jx\n", idx, pa, (uintmax_t)va, (uintmax_t)pgsz); tlb1_set_entry(va, pa, pgsz, _TLB_ENTRY_SHARED | _TLB_ENTRY_MEM); pa += pgsz; va += pgsz; } mapped = (va - base); if (bootverbose) printf("mapped size 0x%"PRIxPTR" (wasted space 0x%"PRIxPTR")\n", mapped, mapped - size); return (mapped); } /* * TLB1 initialization routine, to be called after the very first * assembler level setup done in locore.S. */ void tlb1_init() { vm_offset_t mas2; uint32_t mas0, mas1, mas3, mas7; uint32_t tsz; tlb1_get_tlbconf(); mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(0); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); kernload = ((vm_paddr_t)(mas7 & MAS7_RPN) << 32) | (mas3 & MAS3_RPN); tsz = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; kernsize += (tsz > 0) ? tsize2size(tsz) : 0; kernstart = trunc_page(mas2); /* Setup TLB miss defaults */ set_mas4_defaults(); } /* * pmap_early_io_unmap() should be used in short conjunction with * pmap_early_io_map(), as in the following snippet: * * x = pmap_early_io_map(...); * * pmap_early_io_unmap(x, size); * * And avoiding more allocations between. */ void pmap_early_io_unmap(vm_offset_t va, vm_size_t size) { int i; tlb_entry_t e; vm_size_t isize; size = roundup(size, PAGE_SIZE); isize = size; for (i = 0; i < TLB1_ENTRIES && size > 0; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (va <= e.virt && (va + isize) >= (e.virt + e.size)) { size -= e.size; e.mas1 &= ~MAS1_VALID; tlb1_write_entry(&e, i); } } if (tlb1_map_base == va + isize) tlb1_map_base -= isize; } vm_offset_t pmap_early_io_map(vm_paddr_t pa, vm_size_t size) { vm_paddr_t pa_base; vm_offset_t va, sz; int i; tlb_entry_t e; KASSERT(!pmap_bootstrapped, ("Do not use after PMAP is up!")); for (i = 0; i < TLB1_ENTRIES; i++) { tlb1_read_entry(&e, i); if (!(e.mas1 & MAS1_VALID)) continue; if (pa >= e.phys && (pa + size) <= (e.phys + e.size)) return (e.virt + (pa - e.phys)); } pa_base = rounddown(pa, PAGE_SIZE); size = roundup(size + (pa - pa_base), PAGE_SIZE); tlb1_map_base = roundup2(tlb1_map_base, 1 << (ilog2(size) & ~1)); va = tlb1_map_base + (pa - pa_base); do { sz = 1 << (ilog2(size) & ~1); tlb1_set_entry(tlb1_map_base, pa_base, sz, _TLB_ENTRY_SHARED | _TLB_ENTRY_IO); size -= sz; pa_base += sz; tlb1_map_base += sz; } while (size > 0); return (va); } void pmap_track_page(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; vm_page_t page; struct pv_entry *pve; va = trunc_page(va); pa = pmap_kextract(va); page = PHYS_TO_VM_PAGE(pa); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH(pve, &page->md.pv_list, pv_link) { if ((pmap == pve->pv_pmap) && (va == pve->pv_va)) { goto out; } } page->md.pv_tracked = true; pv_insert(pmap, va, page); out: PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); } /* * Setup MAS4 defaults. * These values are loaded to MAS0-2 on a TLB miss. */ static void set_mas4_defaults(void) { uint32_t mas4; /* Defaults: TLB0, PID0, TSIZED=4K */ mas4 = MAS4_TLBSELD0; mas4 |= (TLB_SIZE_4K << MAS4_TSIZED_SHIFT) & MAS4_TSIZED_MASK; #ifdef SMP mas4 |= MAS4_MD; #endif mtspr(SPR_MAS4, mas4); __asm __volatile("isync"); } /* * Return 0 if the physical IO range is encompassed by one of the * the TLB1 entries, otherwise return related error code. */ static int tlb1_iomapped(int i, vm_paddr_t pa, vm_size_t size, vm_offset_t *va) { uint32_t prot; vm_paddr_t pa_start; vm_paddr_t pa_end; unsigned int entry_tsize; vm_size_t entry_size; tlb_entry_t e; *va = (vm_offset_t)NULL; tlb1_read_entry(&e, i); /* Skip invalid entries */ if (!(e.mas1 & MAS1_VALID)) return (EINVAL); /* * The entry must be cache-inhibited, guarded, and r/w * so it can function as an i/o page */ prot = e.mas2 & (MAS2_I | MAS2_G); if (prot != (MAS2_I | MAS2_G)) return (EPERM); prot = e.mas3 & (MAS3_SR | MAS3_SW); if (prot != (MAS3_SR | MAS3_SW)) return (EPERM); /* The address should be within the entry range. */ entry_tsize = (e.mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; KASSERT((entry_tsize), ("tlb1_iomapped: invalid entry tsize")); entry_size = tsize2size(entry_tsize); pa_start = (((vm_paddr_t)e.mas7 & MAS7_RPN) << 32) | (e.mas3 & MAS3_RPN); pa_end = pa_start + entry_size; if ((pa < pa_start) || ((pa + size) > pa_end)) return (ERANGE); /* Return virtual address of this mapping. */ *va = (e.mas2 & MAS2_EPN_MASK) + (pa - pa_start); return (0); } /* * Invalidate all TLB0 entries which match the given TID. Note this is * dedicated for cases when invalidations should NOT be propagated to other * CPUs. */ static void tid_flush(tlbtid_t tid) { register_t msr; uint32_t mas0, mas1, mas2; int entry, way; /* Don't evict kernel translations */ if (tid == TID_KERNEL) return; msr = mfmsr(); __asm __volatile("wrteei 0"); /* * Newer (e500mc and later) have tlbilx, which doesn't broadcast, so use * it for PID invalidation. */ switch ((mfpvr() >> 16) & 0xffff) { case FSL_E500mc: case FSL_E5500: case FSL_E6500: mtspr(SPR_MAS6, tid << MAS6_SPID0_SHIFT); /* tlbilxpid */ __asm __volatile("isync; .long 0x7c000024; isync; msync"); __asm __volatile("wrtee %0" :: "r"(msr)); return; } for (way = 0; way < TLB0_WAYS; way++) for (entry = 0; entry < TLB0_ENTRIES_PER_WAY; entry++) { mas0 = MAS0_TLBSEL(0) | MAS0_ESEL(way); mtspr(SPR_MAS0, mas0); mas2 = entry << MAS2_TLB0_ENTRY_IDX_SHIFT; mtspr(SPR_MAS2, mas2); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); if (!(mas1 & MAS1_VALID)) continue; if (((mas1 & MAS1_TID_MASK) >> MAS1_TID_SHIFT) != tid) continue; mas1 &= ~MAS1_VALID; mtspr(SPR_MAS1, mas1); __asm __volatile("isync; tlbwe; isync; msync"); } __asm __volatile("wrtee %0" :: "r"(msr)); } #ifdef DDB /* Print out contents of the MAS registers for each TLB0 entry */ static void #ifdef __powerpc64__ tlb_print_entry(int i, uint32_t mas1, uint64_t mas2, uint32_t mas3, #else tlb_print_entry(int i, uint32_t mas1, uint32_t mas2, uint32_t mas3, #endif uint32_t mas7) { int as; char desc[3]; tlbtid_t tid; vm_size_t size; unsigned int tsize; desc[2] = '\0'; if (mas1 & MAS1_VALID) desc[0] = 'V'; else desc[0] = ' '; if (mas1 & MAS1_IPROT) desc[1] = 'P'; else desc[1] = ' '; as = (mas1 & MAS1_TS_MASK) ? 1 : 0; tid = MAS1_GETTID(mas1); tsize = (mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; size = 0; if (tsize) size = tsize2size(tsize); printf("%3d: (%s) [AS=%d] " "sz = 0x%08x tsz = %d tid = %d mas1 = 0x%08x " "mas2(va) = 0x%"PRI0ptrX" mas3(pa) = 0x%08x mas7 = 0x%08x\n", i, desc, as, size, tsize, tid, mas1, mas2, mas3, mas7); } DB_SHOW_COMMAND(tlb0, tlb0_print_tlbentries) { uint32_t mas0, mas1, mas3, mas7; #ifdef __powerpc64__ uint64_t mas2; #else uint32_t mas2; #endif int entryidx, way, idx; printf("TLB0 entries:\n"); for (way = 0; way < TLB0_WAYS; way ++) for (entryidx = 0; entryidx < TLB0_ENTRIES_PER_WAY; entryidx++) { mas0 = MAS0_TLBSEL(0) | MAS0_ESEL(way); mtspr(SPR_MAS0, mas0); mas2 = entryidx << MAS2_TLB0_ENTRY_IDX_SHIFT; mtspr(SPR_MAS2, mas2); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); idx = tlb0_tableidx(mas2, way); tlb_print_entry(idx, mas1, mas2, mas3, mas7); } } /* * Print out contents of the MAS registers for each TLB1 entry */ DB_SHOW_COMMAND(tlb1, tlb1_print_tlbentries) { uint32_t mas0, mas1, mas3, mas7; #ifdef __powerpc64__ uint64_t mas2; #else uint32_t mas2; #endif int i; printf("TLB1 entries:\n"); for (i = 0; i < TLB1_ENTRIES; i++) { mas0 = MAS0_TLBSEL(1) | MAS0_ESEL(i); mtspr(SPR_MAS0, mas0); __asm __volatile("isync; tlbre"); mas1 = mfspr(SPR_MAS1); mas2 = mfspr(SPR_MAS2); mas3 = mfspr(SPR_MAS3); mas7 = mfspr(SPR_MAS7); tlb_print_entry(i, mas1, mas2, mas3, mas7); } } #endif Index: head/sys/riscv/riscv/pmap.c =================================================================== --- head/sys/riscv/riscv/pmap.c (revision 352406) +++ head/sys/riscv/riscv/pmap.c (revision 352407) @@ -1,4468 +1,4468 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2015-2018 Ruslan Bukin * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Portions of this software were developed by Andrew Turner under * sponsorship from The FreeBSD Foundation. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NUL1E (Ln_ENTRIES * Ln_ENTRIES) #define NUL2E (Ln_ENTRIES * NUL1E) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* The list of all the user pmaps */ LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); static struct rwlock_padalign pvh_global_lock; static struct mtx_padalign allpmaps_lock; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN, &superpages_enabled, 0, "Enable support for transparent superpages"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; extern cpuset_t all_harts; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); #define pmap_clear(pte) pmap_store(pte, 0) #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) #define pmap_load_clear(pte) pmap_load_store(pte, 0) #define pmap_load(pte) atomic_load_64(pte) #define pmap_store(pte, entry) atomic_store_64(pte, entry) #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline void pagezero(void *p) { bzero(p, PAGE_SIZE); } #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) #define PTE_TO_PHYS(pte) ((pte >> PTE_PPN0_S) * PAGE_SIZE) static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { vm_paddr_t phys; pd_entry_t *l2; phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & PTE_V) == 0) return (NULL); if ((pmap_load(l1) & PTE_RX) != 0) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { vm_paddr_t phys; pt_entry_t *l3; phys = PTE_TO_PHYS(pmap_load(l2)); l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l3[pmap_l3_index(va)]); } static __inline pt_entry_t * pmap_l3(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2; l2 = pmap_l2(pmap, va); if (l2 == NULL) return (NULL); if ((pmap_load(l2) & PTE_V) == 0) return (NULL); if ((pmap_load(l2) & PTE_RX) != 0) return (NULL); return (pmap_l2_to_l3(l2, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static void pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, pt_entry_t entry) { struct pmap *user_pmap; pd_entry_t *l1; /* Distribute new kernel L1 entry to all the user pmaps */ if (pmap != kernel_pmap) return; mtx_lock(&allpmaps_lock); LIST_FOREACH(user_pmap, &allpmaps, pm_list) { l1 = &user_pmap->pm_l1[l1index]; pmap_store(l1, entry); } mtx_unlock(&allpmaps_lock); } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & PTE_RX) == 0, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_paddr_t ret; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); /* Check locore has used L2 superpages */ KASSERT((l2[l2_slot] & PTE_RX) != 0, ("Invalid bootstrap L2 table")); /* L2 is superpages */ ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT; ret += (va & L2_OFFSET); return (ret); } static void pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) { vm_offset_t va; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; pt_entry_t entry; pn_t pn; pa = dmap_phys_base = min_pa & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; l1 = (pd_entry_t *)kern_l1; l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); for (; va < DMAP_MAX_ADDRESS && pa < max_pa; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); /* superpages */ pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(&l1[l1_slot], entry); } /* Set the upper limit of the DMAP region */ dmap_phys_max = pa; dmap_max_addr = va; sfence_vma(); } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; pt_entry_t entry; pd_entry_t *l2; vm_paddr_t pa; u_int l2_slot; pn_t pn; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pn = (pa / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(&l2[l2_slot], entry); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return (l3pt); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot, avail_slot, map_slot; vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t end, max_pa, min_pa, pa, start; int i; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; PMAP_LOCK_INIT(kernel_pmap); rw_init(&pvh_global_lock, "pmap pv global"); CPU_FILL(&kernel_pmap->pm_active); /* Assume the address we were loaded to is a valid physical address. */ min_pa = max_pa = kernstart; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < physmap_idx * 2; i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; if (physmap[i + 1] > max_pa) max_pa = physmap[i + 1]; } printf("physmap_idx %lx\n", physmap_idx); printf("min_pa %lx\n", min_pa); printf("max_pa %lx\n", max_pa); /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa, max_pa); /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); /* Create the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); sfence_vma(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L2_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); /* Initialize phys_avail and dump_avail. */ for (avail_slot = map_slot = physmem = 0; map_slot < physmap_idx * 2; map_slot += 2) { start = physmap[map_slot]; end = physmap[map_slot + 1]; if (start == end) continue; dump_avail[map_slot] = start; dump_avail[map_slot + 1] = end; realmem += atop((vm_offset_t)(end - start)); if (start >= kernstart && end <= pa) continue; if (start < kernstart && end > kernstart) end = kernstart; else if (start < pa && end > pa) start = pa; phys_avail[avail_slot] = start; phys_avail[avail_slot + 1] = end; physmem += (end - start) >> PAGE_SHIFT; avail_slot += 2; if (end != physmap[map_slot + 1] && end > pa) { phys_avail[avail_slot] = pa; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; physmem += (physmap[map_slot + 1] - pa) >> PAGE_SHIFT; avail_slot += 2; } } phys_avail[avail_slot] = 0; phys_avail[avail_slot + 1] = 0; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = atop(phys_avail[avail_slot - 1]); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Initialize the pv chunk and pmap list mutexes. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); if (superpages_enabled) pagesizes[1] = L2_SIZE; } #ifdef SMP /* * For SMP, these functions have to use IPIs for coherence. * * In general, the calling thread uses a plain fence to order the * writes to the page tables before invoking an SBI callback to invoke * sfence_vma() on remote CPUs. */ static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, va, 1); sfence_vma_page(va); sched_unpin(); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); sched_unpin(); } static void pmap_invalidate_all(pmap_t pmap) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); /* * XXX: The SBI doc doesn't detail how to specify x0 as the * address to perform a global fence. BBL currently treats * all sfence_vma requests as global however. */ fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, 0, 0); sfence_vma(); sched_unpin(); } #else /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sfence_vma_page(va); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sfence_vma(); } #endif /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; pa = 0; PMAP_LOCK(pmap); /* * Start with the l2 tabel. We are unable to allocate * pages in the l1 table. */ l2p = pmap_l2(pmap, va); if (l2p != NULL) { l2 = pmap_load(l2p); if ((l2 & PTE_RX) == 0) { l3p = pmap_l2_to_l3(l2p, va); if (l3p != NULL) { l3 = pmap_load(l3p); pa = PTE_TO_PHYS(l3); pa |= (va & L3_OFFSET); } } else { /* L2 is superpages */ pa = (l2 >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *l3p, l3; vm_paddr_t phys; vm_page_t m; m = NULL; PMAP_LOCK(pmap); l3p = pmap_l3(pmap, va); if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { phys = PTE_TO_PHYS(l3); m = PHYS_TO_VM_PAGE(phys); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *l2; pt_entry_t *l3; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { l2 = pmap_l2(kernel_pmap, va); if (l2 == NULL) panic("pmap_kextract: No l2"); if ((pmap_load(l2) & PTE_RX) != 0) { /* superpages */ pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); return (pa); } l3 = pmap_l2_to_l3(l2, va); if (l3 == NULL) panic("pmap_kextract: No l3..."); pa = PTE_TO_PHYS(pmap_load(l3)); pa |= (va & PAGE_MASK); } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pt_entry_t entry; pt_entry_t *l3; vm_offset_t va; pn_t pn; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *l3; l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); sfence_vma(); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *l3; vm_offset_t va; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pmap_clear(l3); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *l3, pa; vm_offset_t va; vm_page_t m; pt_entry_t entry; pn_t pn; int i; va = sva; for (i = 0; i < count; i++) { m = ma[i]; pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); l3 = pmap_l3(kernel_pmap, va); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *l3; vm_offset_t va; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); for (va = sva; count-- > 0; va += PAGE_SIZE) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); } pmap_invalidate_range(kernel_pmap, sva, va); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "ml3" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, ml3)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else { return (FALSE); } } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { vm_paddr_t phys; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (m->pindex >= NUL1E) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); pmap_distribute_l1(pmap, pmap_l1_index(va), 0); } else { pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL1E) { pd_entry_t *l1; vm_page_t pdpg; l1 = pmap_l1(pmap, va); phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pmap_unwire_ptp(pmap, va, pdpg, free); } pmap_invalidate_page(pmap, va); vm_wire_sub(1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l1 = kernel_pmap->pm_l1; pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT); CPU_ZERO(&pmap->pm_active); pmap_activate_boot(pmap); } int pmap_pinit(pmap_t pmap) { vm_paddr_t l1phys; vm_page_t l1pt; /* * allocate the l1 page */ while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l1phys = VM_PAGE_TO_PHYS(l1pt); pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT); if ((l1pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l1); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); CPU_ZERO(&pmap->pm_active); /* Install kernel pagetables */ memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); /* Add to the list of all user pmaps */ mtx_lock(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock(&allpmaps_lock); vm_radix_init(&pmap->pm_root); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, /*pdppg, */pdpg; pt_entry_t entry; vm_paddr_t phys; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); rw_runlock(&pvh_global_lock); vm_wait(NULL); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= NUL1E) { pd_entry_t *l1; vm_pindex_t l1index; l1index = ptepindex - NUL1E; l1 = &pmap->pm_l1[l1index]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, l1index, entry); } else { vm_pindex_t l1index; pd_entry_t *l1, *l2; l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); l1 = &pmap->pm_l1[l1index]; if (pmap_load(l1) == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL1E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pdpg->wire_count++; } phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); l2pg->wire_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *l2; vm_paddr_t phys; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); m = PHYS_TO_VM_PAGE(phys); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock(&allpmaps_lock); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); vm_page_unwire_noq(m); vm_page_free(m); } #if 0 static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); #endif /* 0 */ /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l1, *l2; pt_entry_t entry; pn_t pn; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l1 = pmap_l1(kernel_pmap, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(kernel_pmap, pmap_l1_index(kernel_vm_end), entry); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & PTE_V) != 0 && (pmap_load(l2) & PTE_RWX) == 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) { pmap_zero_page(nkpg); } paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { panic("RISCVTODO: reclaim_pv_chunk"); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } /* XXX PV STATS */ #if 0 dump_add_page(m->phys_addr); #endif pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void __unused pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; int bit, field; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va &= ~L2_OFFSET; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining 511 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field] != 0) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } /* XXX PV stats */ } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; rw_assert(&pvh_global_lock, RA_LOCKED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_promote_l2: misaligned va %#lx", va)); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); m = PHYS_TO_VM_PAGE(pa); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = PTE_TO_PHYS(l2e); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | PTE_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldl2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); oldl2 = pmap_load_clear(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); /* * The sfence.vma documentation states that it is sufficient to specify * a single address within a superpage mapping. However, since we do * not perform any invalidation upon promotion, TLBs may still be * caching 4KB mappings within the superpage, so we must invalidate the * entire range. */ pmap_invalidate_range(pmap, sva, sva + L2_SIZE); if ((oldl2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if ((oldl2 & PTE_SW_MANAGED) != 0) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); va < eva; va += PAGE_SIZE, m++) { if ((oldl2 & PTE_D) != 0) vm_page_dirty(m); if ((oldl2 & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == Ln_ENTRIES, ("pmap_remove_l2: l3 page wire count error")); ml3->wire_count = 1; vm_page_unwire_noq(ml3); pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { pt_entry_t old_l3; vm_paddr_t phys; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & PTE_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & PTE_SW_MANAGED) { phys = PTE_TO_PHYS(old_l3); m = PHYS_TO_VM_PAGE(phys); if ((old_l3 & PTE_D) != 0) vm_page_dirty(m); if (old_l3 & PTE_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct spglist free; struct rwlock *lock; vm_offset_t va, va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { (void)pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { /* * The large page mapping was destroyed. */ continue; } l2e = pmap_load(l2); } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { sva += L3_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct spglist free; struct md_page *pvh; pmap_t pmap; pt_entry_t *l3, l3e; pd_entry_t *l2, l2e; pv_entry_t pv; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_wlock(&pvh_global_lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; l2 = pmap_l2(pmap, va); (void)pmap_demote_l2(pmap, l2, va); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap_resident_count_dec(pmap, 1); l2 = pmap_l2(pmap, pv->pv_va); KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); l2e = pmap_load(l2); KASSERT((l2e & PTE_RX) == 0, ("pmap_remove_all: found a superpage in %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load_clear(l3); pmap_invalidate_page(pmap, pv->pv_va); if (l3e & PTE_SW_WIRED) pmap->pm_stats.wired_count--; if ((l3e & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((l3e & PTE_D) != 0) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e, mask; vm_page_t m, mt; vm_paddr_t pa; vm_offset_t va_next; bool anychanged, pv_lists_locked; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; anychanged = false; pv_lists_locked = false; mask = 0; if ((prot & VM_PROT_WRITE) == 0) mask |= PTE_W | PTE_D; if ((prot & VM_PROT_EXECUTE) == 0) mask |= PTE_X; resume: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL || (l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { retryl2: if ((prot & VM_PROT_WRITE) == 0 && (l2e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { pa = PTE_TO_PHYS(l2e); m = PHYS_TO_VM_PAGE(pa); for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) goto retryl2; anychanged = true; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); goto resume; } } if (!pmap_demote_l2(pmap, l2, sva)) { /* * The large page mapping was destroyed. */ continue; } } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { l3e = pmap_load(l3); retryl3: if ((l3e & PTE_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0 && (l3e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); vm_page_dirty(m); } if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) goto retryl3; anychanged = true; } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } int pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) { pd_entry_t *l2, l2e; pt_entry_t bits, *pte, oldpte; int rv; rv = 0; PMAP_LOCK(pmap); l2 = pmap_l2(pmap, va); if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) goto done; if ((l2e & PTE_RWX) == 0) { pte = pmap_l2_to_l3(l2, va); if (pte == NULL || ((oldpte = pmap_load(pte) & PTE_V)) == 0) goto done; } else { pte = l2; oldpte = l2e; } if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) goto done; bits = PTE_A; if (ftype == VM_PROT_WRITE) bits |= PTE_D; /* * Spurious faults can occur if the implementation caches invalid * entries in the TLB, or if simultaneous accesses on multiple CPUs * race with each other. */ if ((oldpte & bits) != bits) pmap_store_bits(pte, bits); sfence_vma(); rv = 1; done: PMAP_UNLOCK(pmap); return (rv); } static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) { struct rwlock *lock; bool rv; lock = NULL; rv = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { struct spglist free; vm_page_t mpte; pd_entry_t newl2, oldl2; pt_entry_t *firstl3, newl3; vm_paddr_t mptepa; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl2 = pmap_load(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_l2_locked: " "failure for va %#lx in pmap %p", va, pmap); return (false); } if (va < VM_MAXUSER_ADDRESS) { mpte->wire_count = Ln_ENTRIES; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; KASSERT((oldl2 & PTE_A) != 0, ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); newl3 = oldl2; /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) { for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); } KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " "addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the L2 entry. Otherwise, the * state of the L2 entry and the PV lists will be inconsistent, which * can result in reclaim_pv_chunk() attempting to remove a PV entry from * the wrong PV list and pmap_pv_demote_l2() failing to find the * expected PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & PTE_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Demote the mapping. */ pmap_store(l2, newl2); /* * Demote the PV entry. */ if ((oldl2 & PTE_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", va, pmap); return (true); } #if VM_NRESERVLEVEL > 0 static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3; vm_paddr_t pa; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); va &= ~L2_OFFSET; KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("pmap_promote_l2: invalid l2 entry %p", l2)); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); pa = PTE_TO_PHYS(pmap_load(firstl3)); if ((pa & L2_OFFSET) != 0) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } pa += PAGE_SIZE; for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { if (PTE_TO_PHYS(pmap_load(l3)) != pa) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } if ((pmap_load(l3) & PTE_PROMOTE) != (pmap_load(firstl3) & PTE_PROMOTE)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } pa += PAGE_SIZE; } ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); KASSERT(ml3->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, ml3, true)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), lockp); pmap_store(l2, pmap_load(firstl3)); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *l1, *l2, l2e; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; vm_paddr_t opa, pa, l2_pa, l3_pa; vm_page_t mpte, om, l2_m, l3_m; pt_entry_t entry; pn_t l2_pn, l3_pn, pn; int rv; bool nosleep; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); new_l3 = PTE_V | PTE_R | PTE_A; if (prot & VM_PROT_EXECUTE) new_l3 |= PTE_X; if (flags & VM_PROT_WRITE) new_l3 |= PTE_D; if (prot & VM_PROT_WRITE) new_l3 |= PTE_W; if (va < VM_MAX_USER_ADDRESS) new_l3 |= PTE_U; new_l3 |= (pn << PTE_PPN0_S); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= PTE_SW_WIRED; /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if (prot & VM_PROT_WRITE) new_l3 |= PTE_D; } else new_l3 |= PTE_SW_MANAGED; CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; mpte = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va %#lx unaligned", va)); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); goto out; } l2 = pmap_l2(pmap, va); if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, va, &lock))) { l3 = pmap_l2_to_l3(l2, va); if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } l3 = pmap_l3(pmap, va); } else { l3 = pmap_l3(pmap, va); /* TODO: This is not optimal, but should mostly work */ if (l3 == NULL) { if (l2 == NULL) { l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l2_m == NULL) panic("pmap_enter: l2 pte_m == NULL"); if ((l2_m->flags & PG_ZERO) == 0) pmap_zero_page(l2_m); l2_pa = VM_PAGE_TO_PHYS(l2_m); l2_pn = (l2_pa / PAGE_SIZE); l1 = pmap_l1(pmap, va); entry = (PTE_V); entry |= (l2_pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, pmap_l1_index(va), entry); l2 = pmap_l1_to_l2(l1, va); } l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l3_m == NULL) panic("pmap_enter: l3 pte_m == NULL"); if ((l3_m->flags & PG_ZERO) == 0) pmap_zero_page(l3_m); l3_pa = VM_PAGE_TO_PHYS(l3_m); l3_pn = (l3_pa / PAGE_SIZE); entry = (PTE_V); entry |= (l3_pn << PTE_PPN0_S); pmap_store(l2, entry); l3 = pmap_l2_to_l3(l2, va); } pmap_invalidate_page(pmap, va); } orig_l3 = pmap_load(l3); opa = PTE_TO_PHYS(orig_l3); pv = NULL; /* * Is the specified virtual address already mapped? */ if ((orig_l3 & PTE_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & PTE_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & PTE_SW_MANAGED) != 0 && (new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ orig_l3 = pmap_load_clear(l3); KASSERT(PTE_TO_PHYS(orig_l3) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & PTE_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((orig_l3 & PTE_D) != 0) vm_page_dirty(om); if ((orig_l3 & PTE_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((new_l3 & PTE_SW_MANAGED) == 0) free_pv_entry(pmap, pv); - if ((vm_page_aflags(om) & PGA_WRITEABLE) == 0 && + if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((new_l3 & PTE_SW_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); /* * Update the L3 entry. */ if (orig_l3 != 0) { orig_l3 = pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); KASSERT(PTE_TO_PHYS(orig_l3) == pa, ("pmap_enter: invalid update")); if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == (PTE_D | PTE_SW_MANAGED)) vm_page_dirty(m); } else { pmap_store(l3, new_l3); } #if VM_NRESERVLEVEL > 0 if (mpte != NULL && mpte->wire_count == Ln_ENTRIES && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_l2(pmap, l2, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); if ((m->oflags & VPO_UNMANAGED) == 0) new_l2 |= PTE_SW_MANAGED; if ((prot & VM_PROT_EXECUTE) != 0) new_l2 |= PTE_X; if (va < VM_MAXUSER_ADDRESS) new_l2 |= PTE_U; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, *l3, oldl2; vm_offset_t sva; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((oldl2 = pmap_load(l2)) != 0) { KASSERT(l2pg->wire_count > 1, ("pmap_enter_l2: l2pg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((oldl2 & PTE_RWX) != 0) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { l3 = pmap_l2_to_l3(l2, sva); if ((pmap_load(l3) & PTE_V) != 0 && pmap_remove_l3(pmap, l3, sva, oldl2, &free, lockp) != 0) break; } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & PTE_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, paging-structure * caches could nonetheless have entries that * refer to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & PTE_W) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; vm_paddr_t phys; pd_entry_t *l2; pt_entry_t *l3, newl3; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); mpte = PHYS_TO_VM_PAGE(phys); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; l3 = pmap_l3(kernel_pmap, va); } if (l3 == NULL) panic("pmap_enter_quick_locked: No l3"); if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, false); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | PTE_V | PTE_R; if ((prot & VM_PROT_EXECUTE) != 0) newl3 |= PTE_X; if ((m->oflags & VPO_UNMANAGED) == 0) newl3 |= PTE_SW_MANAGED; if (va < VM_MAX_USER_ADDRESS) newl3 |= PTE_U; /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); pmap_store(l3, newl3); pmap_invalidate_page(pmap, va); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e; bool pv_lists_locked; pv_lists_locked = false; retry: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { if ((l2e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l2e); pmap_clear_bits(l2, PTE_SW_WIRED); continue; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); /* Repeat sva. */ goto retry; } } if (!pmap_demote_l2(pmap, l2, sva)) panic("pmap_unwire: demotion failed"); } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if ((l3e = pmap_load(l3)) == 0) continue; if ((l3e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l3e); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ pmap_clear_bits(l3, PTE_SW_WIRED); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); if ((pmap_load(l3) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); if ((pmap_load(l2) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (count); } static void pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, struct spglist *free, bool superpage) { struct md_page *pvh; vm_page_t mpte, mt; if (superpage) { pmap_resident_count_dec(pmap, Ln_ENTRIES); pvh = pa_to_pvh(m->phys_addr); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[Ln_ENTRIES]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list) && - (vm_page_aflags(mt) & PGA_WRITEABLE) != 0) + (mt->aflags & PGA_WRITEABLE) != 0) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == Ln_ENTRIES, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && - (vm_page_aflags(m) & PGA_WRITEABLE) != 0) { + (m->aflags & PGA_WRITEABLE) != 0) { pvh = pa_to_pvh(m->phys_addr); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { struct spglist free; pd_entry_t ptepde; pt_entry_t *pte, tpte; vm_page_t m, mt; pv_entry_t pv; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; bool superpage; lock = NULL; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_l1(pmap, pv->pv_va); ptepde = pmap_load(pte); pte = pmap_l1_to_l2(pte, pv->pv_va); tpte = pmap_load(pte); if ((tpte & PTE_RWX) != 0) { superpage = true; } else { ptepde = tpte; pte = pmap_l2_to_l3(pte, pv->pv_va); tpte = pmap_load(pte); superpage = false; } /* * We cannot remove wired pages from a * process' mapping at this time. */ if (tpte & PTE_SW_WIRED) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { if (superpage) for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; pmap_remove_pages_pv(pmap, m, pv, &free, superpage); pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } static bool pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct md_page *pvh; struct rwlock *lock; pd_entry_t *l2; pt_entry_t *l3, mask; pv_entry_t pv; pmap_t pmap; int md_gen, pvh_gen; bool rv; mask = 0; if (modified) mask |= PTE_D; if (accessed) mask |= PTE_A; rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); rv = (pmap_load(l3) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); rv = (pmap_load(l2) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *l3; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); l3 = pmap_l3(pmap, addr); if (l3 != NULL && pmap_load(l3) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3, oldl3, newl3; pv_entry_t next_pv, pv; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_rlock(&pvh_global_lock); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); if ((pmap_load(l2) & PTE_W) != 0) (void)pmap_demote_l2_locked(pmap, l2, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3 = pmap_l3(pmap, pv->pv_va); oldl3 = pmap_load(l3); retry: if ((oldl3 & PTE_W) != 0) { newl3 = oldl3 & ~(PTE_D | PTE_W); if (!atomic_fcmpset_long(l3, &oldl3, newl3)) goto retry; if ((oldl3 & PTE_D) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); rw_runlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct spglist free; struct md_page *pvh; struct rwlock *lock; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *l2, l2e; pt_entry_t *l3, l3e; vm_paddr_t pa; vm_offset_t va; int cleared, md_gen, not_cleared, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); lock = PHYS_TO_PV_LIST_LOCK(pa); rw_rlock(&pvh_global_lock); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); l2e = pmap_load(l2); if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { /* * Although l2e is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((l2e & PTE_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (l2e & PTE_SW_WIRED) == 0) { pmap_clear_bits(l2, PTE_A); pmap_invalidate_page(pmap, va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RX) == 0, ("pmap_ts_referenced: found an invalid l2 table")); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load(l3); if ((l3e & PTE_D) != 0) vm_page_dirty(m); if ((l3e & PTE_A) != 0) { if ((l3e & PTE_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_clear_bits(l3, PTE_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(&pvh_global_lock); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has PTE_W set, then it also has PTE_D set. */ if ((oldl2 & PTE_W) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & PTE_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); l3 = pmap_l2_to_l3(l2, va); pmap_clear_bits(l3, PTE_D | PTE_W); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("pmap_clear_modify: found a 2mpage in page %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { pmap_clear_bits(l3, PTE_D | PTE_W); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); rw_runlock(&pvh_global_lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return ((void *)PHYS_TO_DMAP(pa)); } void pmap_unmapbios(vm_paddr_t pa, vm_size_t size) { } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *l2, *l3, tpte; vm_paddr_t pa; int val; bool managed; PMAP_LOCK(pmap); retry: managed = false; val = 0; l2 = pmap_l2(pmap, addr); if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { if ((tpte & PTE_RWX) != 0) { pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); val = MINCORE_INCORE | MINCORE_SUPER; } else { l3 = pmap_l2_to_l3(l2, addr); tpte = pmap_load(l3); if ((tpte & PTE_V) == 0) goto done; pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); val = MINCORE_INCORE; } if ((tpte & PTE_D) != 0) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & PTE_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; } done: if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int hart; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (pmap == oldpmap) return; load_satp(pmap->pm_satp); hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); CPU_CLR(hart, &oldpmap->pm_active); #endif PCPU_SET(curpmap, pmap); sfence_vma(); } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { u_int hart; hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { cpuset_t mask; /* * From the RISC-V User-Level ISA V2.2: * * "To make a store to instruction memory visible to all * RISC-V harts, the writing hart has to execute a data FENCE * before requesting that all remote RISC-V harts execute a * FENCE.I." */ sched_pin(); mask = all_harts; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_fence_i(mask.__bits); sched_unpin(); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l1p, *l2p; /* Get l1 directory entry. */ l1p = pmap_l1(pmap, va); *l1 = l1p; if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) return (false); if ((pmap_load(l1p) & PTE_RX) != 0) { *l2 = NULL; *l3 = NULL; return (true); } /* Get l2 directory entry. */ l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) return (false); if ((pmap_load(l2p) & PTE_RX) != 0) { *l3 = NULL; return (true); } /* Get l3 page table entry. */ *l3 = pmap_l2_to_l3(l2p, va); return (true); } Index: head/sys/sparc64/sparc64/pmap.c =================================================================== --- head/sys/sparc64/sparc64/pmap.c (revision 352406) +++ head/sys/sparc64/sparc64/pmap.c (revision 352407) @@ -1,2320 +1,2320 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Map of physical memory reagions */ static struct ofw_mem_region mra[VM_PHYSSEG_MAX]; struct ofw_mem_region sparc64_memreg[VM_PHYSSEG_MAX]; int sparc64_nmemreg; static struct ofw_map translations[VM_PHYSSEG_MAX]; static int translations_size; static vm_offset_t pmap_idle_map; static vm_offset_t pmap_temp_map_1; static vm_offset_t pmap_temp_map_2; /* * First and last available kernel virtual addresses */ vm_offset_t virtual_avail; vm_offset_t virtual_end; vm_offset_t kernel_vm_end; vm_offset_t vm_max_kernel_address; /* * Kernel pmap */ struct pmap kernel_pmap_store; struct rwlock_padalign tte_list_global_lock; /* * Allocate physical memory for use in pmap_bootstrap. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size, uint32_t colors); static void pmap_bootstrap_set_tte(struct tte *tp, u_long vpn, u_long data); static void pmap_cache_remove(vm_page_t m, vm_offset_t va); static int pmap_protect_tte(struct pmap *pm1, struct pmap *pm2, struct tte *tp, vm_offset_t va); static int pmap_unwire_tte(pmap_t pm, pmap_t pm2, struct tte *tp, vm_offset_t va); static void pmap_init_qpages(void); /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The page queues and pmap must be locked. */ static int pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind); extern int tl1_dmmu_miss_direct_patch_tsb_phys_1[]; extern int tl1_dmmu_miss_direct_patch_tsb_phys_end_1[]; extern int tl1_dmmu_miss_patch_asi_1[]; extern int tl1_dmmu_miss_patch_quad_ldd_1[]; extern int tl1_dmmu_miss_patch_tsb_1[]; extern int tl1_dmmu_miss_patch_tsb_2[]; extern int tl1_dmmu_miss_patch_tsb_mask_1[]; extern int tl1_dmmu_miss_patch_tsb_mask_2[]; extern int tl1_dmmu_prot_patch_asi_1[]; extern int tl1_dmmu_prot_patch_quad_ldd_1[]; extern int tl1_dmmu_prot_patch_tsb_1[]; extern int tl1_dmmu_prot_patch_tsb_2[]; extern int tl1_dmmu_prot_patch_tsb_mask_1[]; extern int tl1_dmmu_prot_patch_tsb_mask_2[]; extern int tl1_immu_miss_patch_asi_1[]; extern int tl1_immu_miss_patch_quad_ldd_1[]; extern int tl1_immu_miss_patch_tsb_1[]; extern int tl1_immu_miss_patch_tsb_2[]; extern int tl1_immu_miss_patch_tsb_mask_1[]; extern int tl1_immu_miss_patch_tsb_mask_2[]; /* * If user pmap is processed with pmap_remove and with pmap_remove and the * resident count drops to 0, there are no more pages to remove, so we * need not continue. */ #define PMAP_REMOVE_DONE(pm) \ ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0) /* * The threshold (in bytes) above which tsb_foreach() is used in pmap_remove() * and pmap_protect() instead of trying each virtual address. */ #define PMAP_TSB_THRESH ((TSB_SIZE / 2) * PAGE_SIZE) SYSCTL_NODE(_debug, OID_AUTO, pmap_stats, CTLFLAG_RD, 0, ""); PMAP_STATS_VAR(pmap_nenter); PMAP_STATS_VAR(pmap_nenter_update); PMAP_STATS_VAR(pmap_nenter_replace); PMAP_STATS_VAR(pmap_nenter_new); PMAP_STATS_VAR(pmap_nkenter); PMAP_STATS_VAR(pmap_nkenter_oc); PMAP_STATS_VAR(pmap_nkenter_stupid); PMAP_STATS_VAR(pmap_nkremove); PMAP_STATS_VAR(pmap_nqenter); PMAP_STATS_VAR(pmap_nqremove); PMAP_STATS_VAR(pmap_ncache_enter); PMAP_STATS_VAR(pmap_ncache_enter_c); PMAP_STATS_VAR(pmap_ncache_enter_oc); PMAP_STATS_VAR(pmap_ncache_enter_cc); PMAP_STATS_VAR(pmap_ncache_enter_coc); PMAP_STATS_VAR(pmap_ncache_enter_nc); PMAP_STATS_VAR(pmap_ncache_enter_cnc); PMAP_STATS_VAR(pmap_ncache_remove); PMAP_STATS_VAR(pmap_ncache_remove_c); PMAP_STATS_VAR(pmap_ncache_remove_oc); PMAP_STATS_VAR(pmap_ncache_remove_cc); PMAP_STATS_VAR(pmap_ncache_remove_coc); PMAP_STATS_VAR(pmap_ncache_remove_nc); PMAP_STATS_VAR(pmap_nzero_page); PMAP_STATS_VAR(pmap_nzero_page_c); PMAP_STATS_VAR(pmap_nzero_page_oc); PMAP_STATS_VAR(pmap_nzero_page_nc); PMAP_STATS_VAR(pmap_nzero_page_area); PMAP_STATS_VAR(pmap_nzero_page_area_c); PMAP_STATS_VAR(pmap_nzero_page_area_oc); PMAP_STATS_VAR(pmap_nzero_page_area_nc); PMAP_STATS_VAR(pmap_ncopy_page); PMAP_STATS_VAR(pmap_ncopy_page_c); PMAP_STATS_VAR(pmap_ncopy_page_oc); PMAP_STATS_VAR(pmap_ncopy_page_nc); PMAP_STATS_VAR(pmap_ncopy_page_dc); PMAP_STATS_VAR(pmap_ncopy_page_doc); PMAP_STATS_VAR(pmap_ncopy_page_sc); PMAP_STATS_VAR(pmap_ncopy_page_soc); PMAP_STATS_VAR(pmap_nnew_thread); PMAP_STATS_VAR(pmap_nnew_thread_oc); static inline u_long dtlb_get_data(u_int tlb, u_int slot); /* * Quick sort callout for comparing memory regions */ static int mr_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b); static int mr_cmp(const void *a, const void *b) { const struct ofw_mem_region *mra; const struct ofw_mem_region *mrb; mra = a; mrb = b; if (mra->mr_start < mrb->mr_start) return (-1); else if (mra->mr_start > mrb->mr_start) return (1); else return (0); } static int om_cmp(const void *a, const void *b) { const struct ofw_map *oma; const struct ofw_map *omb; oma = a; omb = b; if (oma->om_start < omb->om_start) return (-1); else if (oma->om_start > omb->om_start) return (1); else return (0); } static inline u_long dtlb_get_data(u_int tlb, u_int slot) { u_long data; register_t s; slot = TLB_DAR_SLOT(tlb, slot); /* * We read ASI_DTLB_DATA_ACCESS_REG twice back-to-back in order to * work around errata of USIII and beyond. */ s = intr_disable(); (void)ldxa(slot, ASI_DTLB_DATA_ACCESS_REG); data = ldxa(slot, ASI_DTLB_DATA_ACCESS_REG); intr_restore(s); return (data); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(u_int cpu_impl) { struct pmap *pm; struct tte *tp; vm_offset_t off; vm_offset_t va; vm_paddr_t pa; vm_size_t physsz; vm_size_t virtsz; u_long data; u_long vpn; phandle_t pmem; phandle_t vmem; u_int dtlb_slots_avail; int i; int j; int sz; uint32_t asi; uint32_t colors; uint32_t ldd; /* * Set the kernel context. */ pmap_set_kctx(); colors = dcache_color_ignore != 0 ? 1 : DCACHE_COLORS; /* * Find out what physical memory is available from the PROM and * initialize the phys_avail array. This must be done before * pmap_bootstrap_alloc is called. */ if ((pmem = OF_finddevice("/memory")) == -1) OF_panic("%s: finddevice /memory", __func__); if ((sz = OF_getproplen(pmem, "available")) == -1) OF_panic("%s: getproplen /memory/available", __func__); if (PHYS_AVAIL_ENTRIES < sz) OF_panic("%s: phys_avail too small", __func__); if (sizeof(mra) < sz) OF_panic("%s: mra too small", __func__); bzero(mra, sz); if (OF_getprop(pmem, "available", mra, sz) == -1) OF_panic("%s: getprop /memory/available", __func__); sz /= sizeof(*mra); #ifdef DIAGNOSTIC OF_printf("pmap_bootstrap: physical memory\n"); #endif qsort(mra, sz, sizeof (*mra), mr_cmp); physsz = 0; getenv_quad("hw.physmem", &physmem); physmem = btoc(physmem); for (i = 0, j = 0; i < sz; i++, j += 2) { #ifdef DIAGNOSTIC OF_printf("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size); #endif if (physmem != 0 && btoc(physsz + mra[i].mr_size) >= physmem) { if (btoc(physsz) < physmem) { phys_avail[j] = mra[i].mr_start; phys_avail[j + 1] = mra[i].mr_start + (ctob(physmem) - physsz); physsz = ctob(physmem); } break; } phys_avail[j] = mra[i].mr_start; phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size; physsz += mra[i].mr_size; } physmem = btoc(physsz); /* * Calculate the size of kernel virtual memory, and the size and mask * for the kernel TSB based on the phsyical memory size but limited * by the amount of dTLB slots available for locked entries if we have * to lock the TSB in the TLB (given that for spitfire-class CPUs all * of the dt64 slots can hold locked entries but there is no large * dTLB for unlocked ones, we don't use more than half of it for the * TSB). * Note that for reasons unknown OpenSolaris doesn't take advantage of * ASI_ATOMIC_QUAD_LDD_PHYS on UltraSPARC-III. However, given that no * public documentation is available for these, the latter just might * not support it, yet. */ if (cpu_impl == CPU_IMPL_SPARC64V || cpu_impl >= CPU_IMPL_ULTRASPARCIIIp) { tsb_kernel_ldd_phys = 1; virtsz = roundup(5 / 3 * physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT)); } else { dtlb_slots_avail = 0; for (i = 0; i < dtlb_slots; i++) { data = dtlb_get_data(cpu_impl == CPU_IMPL_ULTRASPARCIII ? TLB_DAR_T16 : TLB_DAR_T32, i); if ((data & (TD_V | TD_L)) != (TD_V | TD_L)) dtlb_slots_avail++; } #ifdef SMP dtlb_slots_avail -= PCPU_PAGES; #endif if (cpu_impl >= CPU_IMPL_ULTRASPARCI && cpu_impl < CPU_IMPL_ULTRASPARCIII) dtlb_slots_avail /= 2; virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT)); virtsz = MIN(virtsz, (dtlb_slots_avail * PAGE_SIZE_4M) << (PAGE_SHIFT - TTE_SHIFT)); } vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz; tsb_kernel_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT); tsb_kernel_mask = (tsb_kernel_size >> TTE_SHIFT) - 1; /* * Allocate the kernel TSB and lock it in the TLB if necessary. */ pa = pmap_bootstrap_alloc(tsb_kernel_size, colors); if (pa & PAGE_MASK_4M) OF_panic("%s: TSB unaligned", __func__); tsb_kernel_phys = pa; if (tsb_kernel_ldd_phys == 0) { tsb_kernel = (struct tte *)(VM_MIN_KERNEL_ADDRESS - tsb_kernel_size); pmap_map_tsb(); bzero(tsb_kernel, tsb_kernel_size); } else { tsb_kernel = (struct tte *)TLB_PHYS_TO_DIRECT(tsb_kernel_phys); aszero(ASI_PHYS_USE_EC, tsb_kernel_phys, tsb_kernel_size); } /* * Allocate and map the dynamic per-CPU area for the BSP. */ pa = pmap_bootstrap_alloc(DPCPU_SIZE, colors); dpcpu0 = (void *)TLB_PHYS_TO_DIRECT(pa); /* * Allocate and map the message buffer. */ pa = pmap_bootstrap_alloc(msgbufsize, colors); msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(pa); /* * Patch the TSB addresses and mask as well as the ASIs used to load * it into the trap table. */ #define LDDA_R_I_R(rd, imm_asi, rs1, rs2) \ (EIF_OP(IOP_LDST) | EIF_F3_RD(rd) | EIF_F3_OP3(INS3_LDDA) | \ EIF_F3_RS1(rs1) | EIF_F3_I(0) | EIF_F3_IMM_ASI(imm_asi) | \ EIF_F3_RS2(rs2)) #define OR_R_I_R(rd, imm13, rs1) \ (EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_OR) | \ EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13)) #define SETHI(rd, imm22) \ (EIF_OP(IOP_FORM2) | EIF_F2_RD(rd) | EIF_F2_OP2(INS0_SETHI) | \ EIF_IMM((imm22) >> 10, 22)) #define WR_R_I(rd, imm13, rs1) \ (EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_WR) | \ EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13)) #define PATCH_ASI(addr, asi) do { \ if (addr[0] != WR_R_I(IF_F3_RD(addr[0]), 0x0, \ IF_F3_RS1(addr[0]))) \ OF_panic("%s: patched instructions have changed", \ __func__); \ addr[0] |= EIF_IMM((asi), 13); \ flush(addr); \ } while (0) #define PATCH_LDD(addr, asi) do { \ if (addr[0] != LDDA_R_I_R(IF_F3_RD(addr[0]), 0x0, \ IF_F3_RS1(addr[0]), IF_F3_RS2(addr[0]))) \ OF_panic("%s: patched instructions have changed", \ __func__); \ addr[0] |= EIF_F3_IMM_ASI(asi); \ flush(addr); \ } while (0) #define PATCH_TSB(addr, val) do { \ if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) || \ addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0, \ IF_F3_RS1(addr[1])) || \ addr[3] != SETHI(IF_F2_RD(addr[3]), 0x0)) \ OF_panic("%s: patched instructions have changed", \ __func__); \ addr[0] |= EIF_IMM((val) >> 42, 22); \ addr[1] |= EIF_IMM((val) >> 32, 10); \ addr[3] |= EIF_IMM((val) >> 10, 22); \ flush(addr); \ flush(addr + 1); \ flush(addr + 3); \ } while (0) #define PATCH_TSB_MASK(addr, val) do { \ if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) || \ addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0, \ IF_F3_RS1(addr[1]))) \ OF_panic("%s: patched instructions have changed", \ __func__); \ addr[0] |= EIF_IMM((val) >> 10, 22); \ addr[1] |= EIF_IMM((val), 10); \ flush(addr); \ flush(addr + 1); \ } while (0) if (tsb_kernel_ldd_phys == 0) { asi = ASI_N; ldd = ASI_NUCLEUS_QUAD_LDD; off = (vm_offset_t)tsb_kernel; } else { asi = ASI_PHYS_USE_EC; ldd = ASI_ATOMIC_QUAD_LDD_PHYS; off = (vm_offset_t)tsb_kernel_phys; } PATCH_TSB(tl1_dmmu_miss_direct_patch_tsb_phys_1, tsb_kernel_phys); PATCH_TSB(tl1_dmmu_miss_direct_patch_tsb_phys_end_1, tsb_kernel_phys + tsb_kernel_size - 1); PATCH_ASI(tl1_dmmu_miss_patch_asi_1, asi); PATCH_LDD(tl1_dmmu_miss_patch_quad_ldd_1, ldd); PATCH_TSB(tl1_dmmu_miss_patch_tsb_1, off); PATCH_TSB(tl1_dmmu_miss_patch_tsb_2, off); PATCH_TSB_MASK(tl1_dmmu_miss_patch_tsb_mask_1, tsb_kernel_mask); PATCH_TSB_MASK(tl1_dmmu_miss_patch_tsb_mask_2, tsb_kernel_mask); PATCH_ASI(tl1_dmmu_prot_patch_asi_1, asi); PATCH_LDD(tl1_dmmu_prot_patch_quad_ldd_1, ldd); PATCH_TSB(tl1_dmmu_prot_patch_tsb_1, off); PATCH_TSB(tl1_dmmu_prot_patch_tsb_2, off); PATCH_TSB_MASK(tl1_dmmu_prot_patch_tsb_mask_1, tsb_kernel_mask); PATCH_TSB_MASK(tl1_dmmu_prot_patch_tsb_mask_2, tsb_kernel_mask); PATCH_ASI(tl1_immu_miss_patch_asi_1, asi); PATCH_LDD(tl1_immu_miss_patch_quad_ldd_1, ldd); PATCH_TSB(tl1_immu_miss_patch_tsb_1, off); PATCH_TSB(tl1_immu_miss_patch_tsb_2, off); PATCH_TSB_MASK(tl1_immu_miss_patch_tsb_mask_1, tsb_kernel_mask); PATCH_TSB_MASK(tl1_immu_miss_patch_tsb_mask_2, tsb_kernel_mask); /* * Enter fake 8k pages for the 4MB kernel pages, so that * pmap_kextract() will work for them. */ for (i = 0; i < kernel_tlb_slots; i++) { pa = kernel_tlbs[i].te_pa; va = kernel_tlbs[i].te_va; for (off = 0; off < PAGE_SIZE_4M; off += PAGE_SIZE) { tp = tsb_kvtotte(va + off); vpn = TV_VPN(va + off, TS_8K); data = TD_V | TD_8K | TD_PA(pa + off) | TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W; pmap_bootstrap_set_tte(tp, vpn, data); } } /* * Set the start and end of KVA. The kernel is loaded starting * at the first available 4MB super page, so we advance to the * end of the last one used for it. */ virtual_avail = KERNBASE + kernel_tlb_slots * PAGE_SIZE_4M; virtual_end = vm_max_kernel_address; kernel_vm_end = vm_max_kernel_address; /* * Allocate kva space for temporary mappings. */ pmap_idle_map = virtual_avail; virtual_avail += PAGE_SIZE * colors; pmap_temp_map_1 = virtual_avail; virtual_avail += PAGE_SIZE * colors; pmap_temp_map_2 = virtual_avail; virtual_avail += PAGE_SIZE * colors; /* * Allocate a kernel stack with guard page for thread0 and map it * into the kernel TSB. We must ensure that the virtual address is * colored properly for corresponding CPUs, since we're allocating * from phys_avail so the memory won't have an associated vm_page_t. */ pa = pmap_bootstrap_alloc(KSTACK_PAGES * PAGE_SIZE, colors); kstack0_phys = pa; virtual_avail += roundup(KSTACK_GUARD_PAGES, colors) * PAGE_SIZE; kstack0 = virtual_avail; virtual_avail += roundup(KSTACK_PAGES, colors) * PAGE_SIZE; if (dcache_color_ignore == 0) KASSERT(DCACHE_COLOR(kstack0) == DCACHE_COLOR(kstack0_phys), ("pmap_bootstrap: kstack0 miscolored")); for (i = 0; i < KSTACK_PAGES; i++) { pa = kstack0_phys + i * PAGE_SIZE; va = kstack0 + i * PAGE_SIZE; tp = tsb_kvtotte(va); vpn = TV_VPN(va, TS_8K); data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W; pmap_bootstrap_set_tte(tp, vpn, data); } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = sparc64_btop(phys_avail[i + 1]); /* * Add the PROM mappings to the kernel TSB. */ if ((vmem = OF_finddevice("/virtual-memory")) == -1) OF_panic("%s: finddevice /virtual-memory", __func__); if ((sz = OF_getproplen(vmem, "translations")) == -1) OF_panic("%s: getproplen translations", __func__); if (sizeof(translations) < sz) OF_panic("%s: translations too small", __func__); bzero(translations, sz); if (OF_getprop(vmem, "translations", translations, sz) == -1) OF_panic("%s: getprop /virtual-memory/translations", __func__); sz /= sizeof(*translations); translations_size = sz; #ifdef DIAGNOSTIC OF_printf("pmap_bootstrap: translations\n"); #endif qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { #ifdef DIAGNOSTIC OF_printf("translation: start=%#lx size=%#lx tte=%#lx\n", translations[i].om_start, translations[i].om_size, translations[i].om_tte); #endif if ((translations[i].om_tte & TD_V) == 0) continue; if (translations[i].om_start < VM_MIN_PROM_ADDRESS || translations[i].om_start > VM_MAX_PROM_ADDRESS) continue; for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) { va = translations[i].om_start + off; tp = tsb_kvtotte(va); vpn = TV_VPN(va, TS_8K); data = ((translations[i].om_tte & ~((TD_SOFT2_MASK << TD_SOFT2_SHIFT) | (cpu_impl >= CPU_IMPL_ULTRASPARCI && cpu_impl < CPU_IMPL_ULTRASPARCIII ? (TD_DIAG_SF_MASK << TD_DIAG_SF_SHIFT) : (TD_RSVD_CH_MASK << TD_RSVD_CH_SHIFT)) | (TD_SOFT_MASK << TD_SOFT_SHIFT))) | TD_EXEC) + off; pmap_bootstrap_set_tte(tp, vpn, data); } } /* * Get the available physical memory ranges from /memory/reg. These * are only used for kernel dumps, but it may not be wise to do PROM * calls in that situation. */ if ((sz = OF_getproplen(pmem, "reg")) == -1) OF_panic("%s: getproplen /memory/reg", __func__); if (sizeof(sparc64_memreg) < sz) OF_panic("%s: sparc64_memreg too small", __func__); if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1) OF_panic("%s: getprop /memory/reg", __func__); sparc64_nmemreg = sz / sizeof(*sparc64_memreg); /* * Initialize the kernel pmap (which is statically allocated). */ pm = kernel_pmap; PMAP_LOCK_INIT(pm); for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = TLB_CTX_KERNEL; CPU_FILL(&pm->pm_active); /* * Initialize the global tte list lock, which is more commonly * known as the pmap pv global lock. */ rw_init(&tte_list_global_lock, "pmap pv global"); /* * Flush all non-locked TLB entries possibly left over by the * firmware. */ tlb_flush_nonlocked(); } static void pmap_init_qpages(void) { struct pcpu *pc; int i; if (dcache_color_ignore != 0) return; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE * DCACHE_COLORS); if (pc->pc_qmap_addr == 0) panic("pmap_init_qpages: unable to allocate KVA"); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL); /* * Map the 4MB kernel TSB pages. */ void pmap_map_tsb(void) { vm_offset_t va; vm_paddr_t pa; u_long data; int i; for (i = 0; i < tsb_kernel_size; i += PAGE_SIZE_4M) { va = (vm_offset_t)tsb_kernel + i; pa = tsb_kernel_phys + i; data = TD_V | TD_4M | TD_PA(pa) | TD_L | TD_CP | TD_CV | TD_P | TD_W; stxa(AA_DMMU_TAR, ASI_DMMU, TLB_TAR_VA(va) | TLB_TAR_CTX(TLB_CTX_KERNEL)); stxa_sync(0, ASI_DTLB_DATA_IN_REG, data); } } /* * Set the secondary context to be the kernel context (needed for FP block * operations in the kernel). */ void pmap_set_kctx(void) { stxa(AA_DMMU_SCXR, ASI_DMMU, (ldxa(AA_DMMU_SCXR, ASI_DMMU) & TLB_CXR_PGSZ_MASK) | TLB_CTX_KERNEL); flush(KERNBASE); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from pmap_bootstrap before avail start and end are * calculated. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size, uint32_t colors) { vm_paddr_t pa; int i; size = roundup(size, PAGE_SIZE * colors); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] - phys_avail[i] < size) continue; pa = phys_avail[i]; phys_avail[i] += size; return (pa); } OF_panic("%s: no suitable region found", __func__); } /* * Set a TTE. This function is intended as a helper when tsb_kernel is * direct-mapped but we haven't taken over the trap table, yet, as it's the * case when we are taking advantage of ASI_ATOMIC_QUAD_LDD_PHYS to access * the kernel TSB. */ void pmap_bootstrap_set_tte(struct tte *tp, u_long vpn, u_long data) { if (tsb_kernel_ldd_phys == 0) { tp->tte_vpn = vpn; tp->tte_data = data; } else { stxa((vm_paddr_t)tp + offsetof(struct tte, tte_vpn), ASI_PHYS_USE_EC, vpn); stxa((vm_paddr_t)tp + offsetof(struct tte, tte_data), ASI_PHYS_USE_EC, data); } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.tte_list); m->md.color = DCACHE_COLOR(VM_PAGE_TO_PHYS(m)); m->md.pmap = NULL; } /* * Initialize the pmap module. */ void pmap_init(void) { vm_offset_t addr; vm_size_t size; int result; int i; for (i = 0; i < translations_size; i++) { addr = translations[i].om_start; size = translations[i].om_size; if ((translations[i].om_tte & TD_V) == 0) continue; if (addr < VM_MIN_PROM_ADDRESS || addr > VM_MAX_PROM_ADDRESS) continue; result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); if (result != KERN_SUCCESS || addr != translations[i].om_start) panic("pmap_init: vm_map_find"); } } /* * Extract the physical page address associated with the given * map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pm, vm_offset_t va) { struct tte *tp; vm_paddr_t pa; if (pm == kernel_pmap) return (pmap_kextract(va)); PMAP_LOCK(pm); tp = tsb_tte_lookup(pm, va); if (tp == NULL) pa = 0; else pa = TTE_GET_PA(tp) | (va & TTE_GET_PAGE_MASK(tp)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t pmap_extract_and_hold(pmap_t pm, vm_offset_t va, vm_prot_t prot) { struct tte *tp; vm_page_t m; m = NULL; PMAP_LOCK(pm); if (pm == kernel_pmap) { if (va >= VM_MIN_DIRECT_ADDRESS) { tp = NULL; m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS(va)); if (!vm_page_wire_mapped(m)) m = NULL; } else { tp = tsb_kvtotte(va); if ((tp->tte_data & TD_V) == 0) tp = NULL; } } else tp = tsb_tte_lookup(pm, va); if (tp != NULL && ((tp->tte_data & TD_SW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); if (!vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pm); return (m); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { struct tte *tp; if (va >= VM_MIN_DIRECT_ADDRESS) return (TLB_DIRECT_TO_PHYS(va)); tp = tsb_kvtotte(va); if ((tp->tte_data & TD_V) == 0) return (0); return (TTE_GET_PA(tp) | (va & TTE_GET_PAGE_MASK(tp))); } int pmap_cache_enter(vm_page_t m, vm_offset_t va) { struct tte *tp; int color; rw_assert(&tte_list_global_lock, RA_WLOCKED); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_cache_enter: fake page")); PMAP_STATS_INC(pmap_ncache_enter); if (dcache_color_ignore != 0) return (1); /* * Find the color for this virtual address and note the added mapping. */ color = DCACHE_COLOR(va); m->md.colors[color]++; /* * If all existing mappings have the same color, the mapping is * cacheable. */ if (m->md.color == color) { KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] == 0, ("pmap_cache_enter: cacheable, mappings of other color")); if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_enter_c); else PMAP_STATS_INC(pmap_ncache_enter_oc); return (1); } /* * If there are no mappings of the other color, and the page still has * the wrong color, this must be a new mapping. Change the color to * match the new mapping, which is cacheable. We must flush the page * from the cache now. */ if (m->md.colors[DCACHE_OTHER_COLOR(color)] == 0) { KASSERT(m->md.colors[color] == 1, ("pmap_cache_enter: changing color, not new mapping")); dcache_page_inval(VM_PAGE_TO_PHYS(m)); m->md.color = color; if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_enter_cc); else PMAP_STATS_INC(pmap_ncache_enter_coc); return (1); } /* * If the mapping is already non-cacheable, just return. */ if (m->md.color == -1) { PMAP_STATS_INC(pmap_ncache_enter_nc); return (0); } PMAP_STATS_INC(pmap_ncache_enter_cnc); /* * Mark all mappings as uncacheable, flush any lines with the other * color out of the dcache, and set the color to none (-1). */ TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { atomic_clear_long(&tp->tte_data, TD_CV); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } dcache_page_inval(VM_PAGE_TO_PHYS(m)); m->md.color = -1; return (0); } static void pmap_cache_remove(vm_page_t m, vm_offset_t va) { struct tte *tp; int color; rw_assert(&tte_list_global_lock, RA_WLOCKED); CTR3(KTR_PMAP, "pmap_cache_remove: m=%p va=%#lx c=%d", m, va, m->md.colors[DCACHE_COLOR(va)]); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_cache_remove: fake page")); PMAP_STATS_INC(pmap_ncache_remove); if (dcache_color_ignore != 0) return; KASSERT(m->md.colors[DCACHE_COLOR(va)] > 0, ("pmap_cache_remove: no mappings %d <= 0", m->md.colors[DCACHE_COLOR(va)])); /* * Find the color for this virtual address and note the removal of * the mapping. */ color = DCACHE_COLOR(va); m->md.colors[color]--; /* * If the page is cacheable, just return and keep the same color, even * if there are no longer any mappings. */ if (m->md.color != -1) { if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_remove_c); else PMAP_STATS_INC(pmap_ncache_remove_oc); return; } KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] != 0, ("pmap_cache_remove: uncacheable, no mappings of other color")); /* * If the page is not cacheable (color is -1), and the number of * mappings for this color is not zero, just return. There are * mappings of the other color still, so remain non-cacheable. */ if (m->md.colors[color] != 0) { PMAP_STATS_INC(pmap_ncache_remove_nc); return; } /* * The number of mappings for this color is now zero. Recache the * other colored mappings, and change the page color to the other * color. There should be no lines in the data cache for this page, * so flushing should not be needed. */ TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { atomic_set_long(&tp->tte_data, TD_CV); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } m->md.color = DCACHE_OTHER_COLOR(color); if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_remove_cc); else PMAP_STATS_INC(pmap_ncache_remove_coc); } /* * Map a wired page into kernel virtual address space. */ void pmap_kenter(vm_offset_t va, vm_page_t m) { vm_offset_t ova; struct tte *tp; vm_page_t om; u_long data; rw_assert(&tte_list_global_lock, RA_WLOCKED); PMAP_STATS_INC(pmap_nkenter); tp = tsb_kvtotte(va); CTR4(KTR_PMAP, "pmap_kenter: va=%#lx pa=%#lx tp=%p data=%#lx", va, VM_PAGE_TO_PHYS(m), tp, tp->tte_data); if (DCACHE_COLOR(VM_PAGE_TO_PHYS(m)) != DCACHE_COLOR(va)) { CTR5(KTR_SPARE2, "pmap_kenter: off color va=%#lx pa=%#lx o=%p ot=%d pi=%#lx", va, VM_PAGE_TO_PHYS(m), m->object, m->object ? m->object->type : -1, m->pindex); PMAP_STATS_INC(pmap_nkenter_oc); } if ((tp->tte_data & TD_V) != 0) { om = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); ova = TTE_GET_VA(tp); if (m == om && va == ova) { PMAP_STATS_INC(pmap_nkenter_stupid); return; } TAILQ_REMOVE(&om->md.tte_list, tp, tte_link); pmap_cache_remove(om, ova); if (va != ova) tlb_page_demap(kernel_pmap, ova); } data = TD_V | TD_8K | VM_PAGE_TO_PHYS(m) | TD_REF | TD_SW | TD_CP | TD_P | TD_W; if (pmap_cache_enter(m, va) != 0) data |= TD_CV; tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = data; TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link); } /* * Map a wired page into kernel virtual address space. This additionally * takes a flag argument which is or'ed to the TTE data. This is used by * sparc64_bus_mem_map(). * NOTE: if the mapping is non-cacheable, it's the caller's responsibility * to flush entries that might still be in the cache, if applicable. */ void pmap_kenter_flags(vm_offset_t va, vm_paddr_t pa, u_long flags) { struct tte *tp; tp = tsb_kvtotte(va); CTR4(KTR_PMAP, "pmap_kenter_flags: va=%#lx pa=%#lx tp=%p data=%#lx", va, pa, tp, tp->tte_data); tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_P | flags; } /* * Remove a wired page from kernel virtual address space. */ void pmap_kremove(vm_offset_t va) { struct tte *tp; vm_page_t m; rw_assert(&tte_list_global_lock, RA_WLOCKED); PMAP_STATS_INC(pmap_nkremove); tp = tsb_kvtotte(va); CTR3(KTR_PMAP, "pmap_kremove: va=%#lx tp=%p data=%#lx", va, tp, tp->tte_data); if ((tp->tte_data & TD_V) == 0) return; m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); pmap_cache_remove(m, va); TTE_ZERO(tp); } /* * Inverse of pmap_kenter_flags, used by bus_space_unmap(). */ void pmap_kremove_flags(vm_offset_t va) { struct tte *tp; tp = tsb_kvtotte(va); CTR3(KTR_PMAP, "pmap_kremove_flags: va=%#lx tp=%p data=%#lx", va, tp, tp->tte_data); TTE_ZERO(tp); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return (TLB_PHYS_TO_DIRECT(start)); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; PMAP_STATS_INC(pmap_nqenter); va = sva; rw_wlock(&tte_list_global_lock); while (count-- > 0) { pmap_kenter(va, *m); va += PAGE_SIZE; m++; } rw_wunlock(&tte_list_global_lock); tlb_range_demap(kernel_pmap, sva, va); } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by pmap_qenter. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; PMAP_STATS_INC(pmap_nqremove); va = sva; rw_wlock(&tte_list_global_lock); while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } rw_wunlock(&tte_list_global_lock); tlb_range_demap(kernel_pmap, sva, va); } /* * Initialize the pmap associated with process 0. */ void pmap_pinit0(pmap_t pm) { int i; PMAP_LOCK_INIT(pm); for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = TLB_CTX_KERNEL; CPU_ZERO(&pm->pm_active); pm->pm_tsb = NULL; pm->pm_tsb_obj = NULL; bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Initialize a preallocated and zeroed pmap structure, such as one in a * vmspace structure. */ int pmap_pinit(pmap_t pm) { vm_page_t ma[TSB_PAGES]; int i; /* * Allocate KVA space for the TSB. */ if (pm->pm_tsb == NULL) { pm->pm_tsb = (struct tte *)kva_alloc(TSB_BSIZE); if (pm->pm_tsb == NULL) return (0); } /* * Allocate an object for it. */ if (pm->pm_tsb_obj == NULL) pm->pm_tsb_obj = vm_object_allocate(OBJT_PHYS, TSB_PAGES); for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = -1; CPU_ZERO(&pm->pm_active); VM_OBJECT_WLOCK(pm->pm_tsb_obj); (void)vm_page_grab_pages(pm->pm_tsb_obj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO, ma, TSB_PAGES); VM_OBJECT_WUNLOCK(pm->pm_tsb_obj); for (i = 0; i < TSB_PAGES; i++) ma[i]->md.pmap = pm; pmap_qenter((vm_offset_t)pm->pm_tsb, ma, TSB_PAGES); bzero(&pm->pm_stats, sizeof(pm->pm_stats)); return (1); } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pm) { vm_object_t obj; vm_page_t m; #ifdef SMP struct pcpu *pc; #endif CTR2(KTR_PMAP, "pmap_release: ctx=%#x tsb=%p", pm->pm_context[curcpu], pm->pm_tsb); KASSERT(pmap_resident_count(pm) == 0, ("pmap_release: resident pages %ld != 0", pmap_resident_count(pm))); /* * After the pmap was freed, it might be reallocated to a new process. * When switching, this might lead us to wrongly assume that we need * not switch contexts because old and new pmap pointer are equal. * Therefore, make sure that this pmap is not referenced by any PCPU * pointer any more. This could happen in two cases: * - A process that referenced the pmap is currently exiting on a CPU. * However, it is guaranteed to not switch in any more after setting * its state to PRS_ZOMBIE. * - A process that referenced this pmap ran on a CPU, but we switched * to a kernel thread, leaving the pmap pointer unchanged. */ #ifdef SMP sched_pin(); STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) atomic_cmpset_rel_ptr((uintptr_t *)&pc->pc_pmap, (uintptr_t)pm, (uintptr_t)NULL); sched_unpin(); #else critical_enter(); if (PCPU_GET(pmap) == pm) PCPU_SET(pmap, NULL); critical_exit(); #endif pmap_qremove((vm_offset_t)pm->pm_tsb, TSB_PAGES); obj = pm->pm_tsb_obj; VM_OBJECT_WLOCK(obj); KASSERT(obj->ref_count == 1, ("pmap_release: tsbobj ref count != 1")); while (!TAILQ_EMPTY(&obj->memq)) { m = TAILQ_FIRST(&obj->memq); m->md.pmap = NULL; vm_page_unwire_noq(m); vm_page_free_zero(m); } VM_OBJECT_WUNLOCK(obj); } /* * Grow the number of kernel page table entries. Unneeded. */ void pmap_growkernel(vm_offset_t addr) { panic("pmap_growkernel: can't grow kernel"); } int pmap_remove_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp, vm_offset_t va) { vm_page_t m; u_long data; rw_assert(&tte_list_global_lock, RA_WLOCKED); data = atomic_readandclear_long(&tp->tte_data); if ((data & TD_FAKE) == 0) { m = PHYS_TO_VM_PAGE(TD_PA(data)); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); if ((data & TD_WIRED) != 0) pm->pm_stats.wired_count--; if ((data & TD_PV) != 0) { if ((data & TD_W) != 0) vm_page_dirty(m); if ((data & TD_REF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.tte_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pm->pm_stats.resident_count--; } pmap_cache_remove(m, va); } TTE_ZERO(tp); if (PMAP_REMOVE_DONE(pm)) return (0); return (1); } /* * Remove the given range of addresses from the specified map. */ void pmap_remove(pmap_t pm, vm_offset_t start, vm_offset_t end) { struct tte *tp; vm_offset_t va; CTR3(KTR_PMAP, "pmap_remove: ctx=%#lx start=%#lx end=%#lx", pm->pm_context[curcpu], start, end); if (PMAP_REMOVE_DONE(pm)) return; rw_wlock(&tte_list_global_lock); PMAP_LOCK(pm); if (end - start > PMAP_TSB_THRESH) { tsb_foreach(pm, NULL, start, end, pmap_remove_tte); tlb_context_demap(pm); } else { for (va = start; va < end; va += PAGE_SIZE) if ((tp = tsb_tte_lookup(pm, va)) != NULL && !pmap_remove_tte(pm, NULL, tp, va)) break; tlb_range_demap(pm, start, end - 1); } PMAP_UNLOCK(pm); rw_wunlock(&tte_list_global_lock); } void pmap_remove_all(vm_page_t m) { struct pmap *pm; struct tte *tpn; struct tte *tp; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); rw_wlock(&tte_list_global_lock); for (tp = TAILQ_FIRST(&m->md.tte_list); tp != NULL; tp = tpn) { tpn = TAILQ_NEXT(tp, tte_link); if ((tp->tte_data & TD_PV) == 0) continue; pm = TTE_GET_PMAP(tp); va = TTE_GET_VA(tp); PMAP_LOCK(pm); if ((tp->tte_data & TD_WIRED) != 0) pm->pm_stats.wired_count--; if ((tp->tte_data & TD_REF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); if ((tp->tte_data & TD_W) != 0) vm_page_dirty(m); tp->tte_data &= ~TD_V; tlb_page_demap(pm, va); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); pm->pm_stats.resident_count--; pmap_cache_remove(m, va); TTE_ZERO(tp); PMAP_UNLOCK(pm); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&tte_list_global_lock); } static int pmap_protect_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp, vm_offset_t va) { u_long data; vm_page_t m; PMAP_LOCK_ASSERT(pm, MA_OWNED); data = atomic_clear_long(&tp->tte_data, TD_SW | TD_W); if ((data & (TD_PV | TD_W)) == (TD_PV | TD_W)) { m = PHYS_TO_VM_PAGE(TD_PA(data)); vm_page_dirty(m); } return (1); } /* * Set the physical protection on the specified range of this map as requested. */ void pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va; struct tte *tp; CTR4(KTR_PMAP, "pmap_protect: ctx=%#lx sva=%#lx eva=%#lx prot=%#lx", pm->pm_context[curcpu], sva, eva, prot); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pm, sva, eva); return; } if (prot & VM_PROT_WRITE) return; PMAP_LOCK(pm); if (eva - sva > PMAP_TSB_THRESH) { tsb_foreach(pm, NULL, sva, eva, pmap_protect_tte); tlb_context_demap(pm); } else { for (va = sva; va < eva; va += PAGE_SIZE) if ((tp = tsb_tte_lookup(pm, va)) != NULL) pmap_protect_tte(pm, NULL, tp, va); tlb_range_demap(pm, sva, eva - 1); } PMAP_UNLOCK(pm); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ int pmap_enter(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { int rv; rw_wlock(&tte_list_global_lock); PMAP_LOCK(pm); rv = pmap_enter_locked(pm, va, m, prot, flags, psind); rw_wunlock(&tte_list_global_lock); PMAP_UNLOCK(pm); return (rv); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The page queues and pmap must be locked. */ static int pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct tte *tp; vm_paddr_t pa; vm_page_t real; u_long data; boolean_t wired; rw_assert(&tte_list_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pm, MA_OWNED); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); PMAP_STATS_INC(pmap_nenter); pa = VM_PAGE_TO_PHYS(m); wired = (flags & PMAP_ENTER_WIRED) != 0; /* * If this is a fake page from the device_pager, but it covers actual * physical memory, convert to the real backing page. */ if ((m->flags & PG_FICTITIOUS) != 0) { real = vm_phys_paddr_to_vm_page(pa); if (real != NULL) m = real; } CTR6(KTR_PMAP, "pmap_enter_locked: ctx=%p m=%p va=%#lx pa=%#lx prot=%#x wired=%d", pm->pm_context[curcpu], m, va, pa, prot, wired); /* * If there is an existing mapping, and the physical address has not * changed, must be protection or wiring change. */ if ((tp = tsb_tte_lookup(pm, va)) != NULL && TTE_GET_PA(tp) == pa) { CTR0(KTR_PMAP, "pmap_enter_locked: update"); PMAP_STATS_INC(pmap_nenter_update); /* * Wiring change, just update stats. */ if (wired) { if ((tp->tte_data & TD_WIRED) == 0) { tp->tte_data |= TD_WIRED; pm->pm_stats.wired_count++; } } else { if ((tp->tte_data & TD_WIRED) != 0) { tp->tte_data &= ~TD_WIRED; pm->pm_stats.wired_count--; } } /* * Save the old bits and clear the ones we're interested in. */ data = tp->tte_data; tp->tte_data &= ~(TD_EXEC | TD_SW | TD_W); /* * If we're turning off write permissions, sense modify status. */ if ((prot & VM_PROT_WRITE) != 0) { tp->tte_data |= TD_SW; if (wired) tp->tte_data |= TD_W; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } else if ((data & TD_W) != 0) vm_page_dirty(m); /* * If we're turning on execute permissions, flush the icache. */ if ((prot & VM_PROT_EXECUTE) != 0) { if ((data & TD_EXEC) == 0) icache_page_inval(pa); tp->tte_data |= TD_EXEC; } /* * Delete the old mapping. */ tlb_page_demap(pm, TTE_GET_VA(tp)); } else { /* * If there is an existing mapping, but its for a different * physical address, delete the old mapping. */ if (tp != NULL) { CTR0(KTR_PMAP, "pmap_enter_locked: replace"); PMAP_STATS_INC(pmap_nenter_replace); pmap_remove_tte(pm, NULL, tp, va); tlb_page_demap(pm, va); } else { CTR0(KTR_PMAP, "pmap_enter_locked: new"); PMAP_STATS_INC(pmap_nenter_new); } /* * Now set up the data and install the new mapping. */ data = TD_V | TD_8K | TD_PA(pa); if (pm == kernel_pmap) data |= TD_P; if ((prot & VM_PROT_WRITE) != 0) { data |= TD_SW; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_aflag_set(m, PGA_WRITEABLE); } if (prot & VM_PROT_EXECUTE) { data |= TD_EXEC; icache_page_inval(pa); } /* * If its wired update stats. We also don't need reference or * modify tracking for wired mappings, so set the bits now. */ if (wired) { pm->pm_stats.wired_count++; data |= TD_REF | TD_WIRED; if ((prot & VM_PROT_WRITE) != 0) data |= TD_W; } tsb_tte_enter(pm, m, va, TS_8K, data); } return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); m = m_start; rw_wlock(&tte_list_global_lock); PMAP_LOCK(pm); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_locked(pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), 0, 0); m = TAILQ_NEXT(m, listq); } rw_wunlock(&tte_list_global_lock); PMAP_UNLOCK(pm); } void pmap_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&tte_list_global_lock); PMAP_LOCK(pm); pmap_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), 0, 0); rw_wunlock(&tte_list_global_lock); PMAP_UNLOCK(pm); } void pmap_object_init_pt(pmap_t pm, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } static int pmap_unwire_tte(pmap_t pm, pmap_t pm2, struct tte *tp, vm_offset_t va) { PMAP_LOCK_ASSERT(pm, MA_OWNED); if ((tp->tte_data & TD_WIRED) == 0) panic("pmap_unwire_tte: tp %p is missing TD_WIRED", tp); atomic_clear_long(&tp->tte_data, TD_WIRED); pm->pm_stats.wired_count--; return (1); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range must * have the wired attribute set. In contrast, invalid mappings cannot have * the wired attribute set, so they are ignored. * * The wired attribute of the translation table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va; struct tte *tp; PMAP_LOCK(pm); if (eva - sva > PMAP_TSB_THRESH) tsb_foreach(pm, NULL, sva, eva, pmap_unwire_tte); else { for (va = sva; va < eva; va += PAGE_SIZE) if ((tp = tsb_tte_lookup(pm, va)) != NULL) pmap_unwire_tte(pm, NULL, tp, va); } PMAP_UNLOCK(pm); } static int pmap_copy_tte(pmap_t src_pmap, pmap_t dst_pmap, struct tte *tp, vm_offset_t va) { vm_page_t m; u_long data; if ((tp->tte_data & TD_FAKE) != 0) return (1); if (tsb_tte_lookup(dst_pmap, va) == NULL) { data = tp->tte_data & ~(TD_PV | TD_REF | TD_SW | TD_CV | TD_W); m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); tsb_tte_enter(dst_pmap, m, va, TS_8K, data); } return (1); } void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct tte *tp; vm_offset_t va; if (dst_addr != src_addr) return; rw_wlock(&tte_list_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } if (len > PMAP_TSB_THRESH) { tsb_foreach(src_pmap, dst_pmap, src_addr, src_addr + len, pmap_copy_tte); tlb_context_demap(dst_pmap); } else { for (va = src_addr; va < src_addr + len; va += PAGE_SIZE) if ((tp = tsb_tte_lookup(src_pmap, va)) != NULL) pmap_copy_tte(src_pmap, dst_pmap, tp, va); tlb_range_demap(dst_pmap, src_addr, src_addr + len - 1); } rw_wunlock(&tte_list_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } void pmap_zero_page(vm_page_t m) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page: fake page")); PMAP_STATS_INC(pmap_nzero_page); pa = VM_PAGE_TO_PHYS(m); if (dcache_color_ignore != 0 || m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_c); va = TLB_PHYS_TO_DIRECT(pa); cpu_block_zero((void *)va, PAGE_SIZE); } else if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_nc); aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_nzero_page_oc); PMAP_LOCK(kernel_pmap); va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); cpu_block_zero((void *)va, PAGE_SIZE); tlb_page_demap(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } void pmap_zero_page_area(vm_page_t m, int off, int size) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page_area: fake page")); KASSERT(off + size <= PAGE_SIZE, ("pmap_zero_page_area: bad off/size")); PMAP_STATS_INC(pmap_nzero_page_area); pa = VM_PAGE_TO_PHYS(m); if (dcache_color_ignore != 0 || m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_area_c); va = TLB_PHYS_TO_DIRECT(pa); bzero((void *)(va + off), size); } else if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_area_nc); aszero(ASI_PHYS_USE_EC, pa + off, size); } else { PMAP_STATS_INC(pmap_nzero_page_area_oc); PMAP_LOCK(kernel_pmap); va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); bzero((void *)(va + off), size); tlb_page_demap(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t vdst; vm_offset_t vsrc; vm_paddr_t pdst; vm_paddr_t psrc; struct tte *tp; KASSERT((mdst->flags & PG_FICTITIOUS) == 0, ("pmap_copy_page: fake dst page")); KASSERT((msrc->flags & PG_FICTITIOUS) == 0, ("pmap_copy_page: fake src page")); PMAP_STATS_INC(pmap_ncopy_page); pdst = VM_PAGE_TO_PHYS(mdst); psrc = VM_PAGE_TO_PHYS(msrc); if (dcache_color_ignore != 0 || (msrc->md.color == DCACHE_COLOR(psrc) && mdst->md.color == DCACHE_COLOR(pdst))) { PMAP_STATS_INC(pmap_ncopy_page_c); vdst = TLB_PHYS_TO_DIRECT(pdst); vsrc = TLB_PHYS_TO_DIRECT(psrc); cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE); } else if (msrc->md.color == -1 && mdst->md.color == -1) { PMAP_STATS_INC(pmap_ncopy_page_nc); ascopy(ASI_PHYS_USE_EC, psrc, pdst, PAGE_SIZE); } else if (msrc->md.color == -1) { if (mdst->md.color == DCACHE_COLOR(pdst)) { PMAP_STATS_INC(pmap_ncopy_page_dc); vdst = TLB_PHYS_TO_DIRECT(pdst); ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_ncopy_page_doc); PMAP_LOCK(kernel_pmap); vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE); tp = tsb_kvtotte(vdst); tp->tte_data = TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vdst, TS_8K); ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vdst); PMAP_UNLOCK(kernel_pmap); } } else if (mdst->md.color == -1) { if (msrc->md.color == DCACHE_COLOR(psrc)) { PMAP_STATS_INC(pmap_ncopy_page_sc); vsrc = TLB_PHYS_TO_DIRECT(psrc); ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_ncopy_page_soc); PMAP_LOCK(kernel_pmap); vsrc = pmap_temp_map_1 + (msrc->md.color * PAGE_SIZE); tp = tsb_kvtotte(vsrc); tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } } else { PMAP_STATS_INC(pmap_ncopy_page_oc); PMAP_LOCK(kernel_pmap); vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE); tp = tsb_kvtotte(vdst); tp->tte_data = TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vdst, TS_8K); vsrc = pmap_temp_map_2 + (msrc->md.color * PAGE_SIZE); tp = tsb_kvtotte(vsrc); tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vdst); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t pa; vm_offset_t qaddr; struct tte *tp; pa = VM_PAGE_TO_PHYS(m); if (dcache_color_ignore != 0 || m->md.color == DCACHE_COLOR(pa)) return (TLB_PHYS_TO_DIRECT(pa)); critical_enter(); qaddr = PCPU_GET(qmap_addr); qaddr += (PAGE_SIZE * ((DCACHE_COLORS + DCACHE_COLOR(pa) - DCACHE_COLOR(qaddr)) % DCACHE_COLORS)); tp = tsb_kvtotte(qaddr); KASSERT(tp->tte_data == 0, ("pmap_quick_enter_page: PTE busy")); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(qaddr, TS_8K); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; struct tte *tp; if (addr >= VM_MIN_DIRECT_ADDRESS) return; tp = tsb_kvtotte(addr); qaddr = PCPU_GET(qmap_addr); KASSERT((addr >= qaddr) && (addr < (qaddr + (PAGE_SIZE * DCACHE_COLORS))), ("pmap_quick_remove_page: invalid address")); KASSERT(tp->tte_data != 0, ("pmap_quick_remove_page: PTE not in use")); stxa(TLB_DEMAP_VA(addr) | TLB_DEMAP_NUCLEUS | TLB_DEMAP_PAGE, ASI_DMMU_DEMAP, 0); stxa(TLB_DEMAP_VA(addr) | TLB_DEMAP_NUCLEUS | TLB_DEMAP_PAGE, ASI_IMMU_DEMAP, 0); flush(KERNBASE); TTE_ZERO(tp); critical_exit(); } int unmapped_buf_allowed; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { panic("pmap_copy_pages: not implemented"); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pm, vm_page_t m) { struct tte *tp; int loops; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); loops = 0; rv = FALSE; rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; if (TTE_GET_PMAP(tp) == pm) { rv = TRUE; break; } if (++loops >= 16) break; } rw_wunlock(&tte_list_global_lock); return (rv); } /* * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct tte *tp; int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) if ((tp->tte_data & (TD_PV | TD_WIRED)) == (TD_PV | TD_WIRED)) count++; rw_wunlock(&tte_list_global_lock); return (count); } /* * Remove all pages from specified address space, this aids process exit * speeds. This is much faster than pmap_remove in the case of running down * an entire address space. Only works for the current pmap. */ void pmap_remove_pages(pmap_t pm) { } /* * Returns TRUE if the given page has a managed mapping. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct tte *tp; boolean_t rv; rv = FALSE; if ((m->oflags & VPO_UNMANAGED) != 0) return (rv); rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) if ((tp->tte_data & TD_PV) != 0) { rv = TRUE; break; } rw_wunlock(&tte_list_global_lock); return (rv); } /* * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct tte *tpf; struct tte *tpn; struct tte *tp; u_long data; int count; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); count = 0; rw_wlock(&tte_list_global_lock); if ((tp = TAILQ_FIRST(&m->md.tte_list)) != NULL) { tpf = tp; do { tpn = TAILQ_NEXT(tp, tte_link); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link); if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_REF); if ((data & TD_W) != 0) vm_page_dirty(m); if ((data & TD_REF) != 0 && ++count >= PMAP_TS_REFERENCED_MAX) break; } while ((tp = tpn) != NULL && tp != tpf); } rw_wunlock(&tte_list_global_lock); return (count); } boolean_t pmap_is_modified(vm_page_t m) { struct tte *tp; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); rv = FALSE; /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no TTEs can have TD_W set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (rv); rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; if ((tp->tte_data & TD_W) != 0) { rv = TRUE; break; } } rw_wunlock(&tte_list_global_lock); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { boolean_t rv; PMAP_LOCK(pmap); rv = tsb_tte_lookup(pmap, addr) == NULL; PMAP_UNLOCK(pmap); return (rv); } /* * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { struct tte *tp; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rv = FALSE; rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; if ((tp->tte_data & TD_REF) != 0) { rv = TRUE; break; } } rw_wunlock(&tte_list_global_lock); return (rv); } /* * This function is advisory. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } void pmap_clear_modify(vm_page_t m) { struct tte *tp; u_long data; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no TTEs can have TD_W set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ - if ((vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_W); if ((data & TD_W) != 0) tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } rw_wunlock(&tte_list_global_lock); } void pmap_remove_write(vm_page_t m) { struct tte *tp; u_long data; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (vm_page_aflags(m) & PGA_WRITEABLE) == 0) + if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&tte_list_global_lock); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_SW | TD_W); if ((data & TD_W) != 0) { vm_page_dirty(m); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&tte_list_global_lock); } int pmap_mincore(pmap_t pm, vm_offset_t addr, vm_paddr_t *locked_pa) { /* TODO; */ return (0); } /* * Activate a user pmap. The pmap must be activated before its address space * can be accessed in any way. */ void pmap_activate(struct thread *td) { struct vmspace *vm; struct pmap *pm; int context; critical_enter(); vm = td->td_proc->p_vmspace; pm = vmspace_pmap(vm); context = PCPU_GET(tlb_ctx); if (context == PCPU_GET(tlb_ctx_max)) { tlb_flush_user(); context = PCPU_GET(tlb_ctx_min); } PCPU_SET(tlb_ctx, context + 1); pm->pm_context[curcpu] = context; #ifdef SMP CPU_SET_ATOMIC(PCPU_GET(cpuid), &pm->pm_active); atomic_store_acq_ptr((uintptr_t *)PCPU_PTR(pmap), (uintptr_t)pm); #else CPU_SET(PCPU_GET(cpuid), &pm->pm_active); PCPU_SET(pmap, pm); #endif stxa(AA_DMMU_TSB, ASI_DMMU, pm->pm_tsb); stxa(AA_IMMU_TSB, ASI_IMMU, pm->pm_tsb); stxa(AA_DMMU_PCXR, ASI_DMMU, (ldxa(AA_DMMU_PCXR, ASI_DMMU) & TLB_CXR_PGSZ_MASK) | context); flush(KERNBASE); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode == VM_MEMATTR_DEFAULT); } Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 352406) +++ head/sys/vm/swap_pager.c (revision 352407) @@ -1,3019 +1,3025 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64. * The 64-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 32 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif #define SWAP_META_PAGES PCTRIE_COUNT /* * A swblk structure maps each page index within a * SWAP_META_PAGES-aligned and sized range to the address of an * on-disk swap block (or SWAPBLK_NONE). The collection of these * mappings for an entire vm object is implemented as a pc-trie. */ struct swblk { vm_pindex_t p; daddr_t d[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static u_long swap_reserved; static u_long swap_total; static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_reserved, 0, sysctl_page_shift, "A", "Amount of swap storage needed to back all allocated anonymous memory."); SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_total, 0, sysctl_page_shift, "A", "Total amount of available swap storage."); static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) static int sysctl_page_shift(SYSCTL_HANDLER_ARGS) { uint64_t newval; u_long value = *(u_long *)arg1; newval = ((uint64_t)value) << PAGE_SHIFT; return (sysctl_handle_64(oidp, &newval, 0, req)); } int swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { u_long r, s, prev, pincr; int res, error; static int curfail; static struct timeval lastfail; struct uidinfo *uip; uip = cred->cr_ruidinfo; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); error = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); if (error != 0) return (0); } #endif pincr = atop(incr); res = 0; prev = atomic_fetchadd_long(&swap_reserved, pincr); r = prev + pincr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_wire_count(); } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; } else { prev = atomic_fetchadd_long(&swap_reserved, -pincr); if (prev < pincr) panic("swap_reserved < incr on overcommit fail"); } if (res) { prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) { res = 0; prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); if (prev < pincr) panic("uip->ui_vmsize < incr on overcommit fail"); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", uip->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (racct_enable && !res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (res); } void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; u_long pincr; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); PROC_LOCK(curproc); #ifdef RACCT if (racct_enable) racct_add_force(curproc, RACCT_SWAP, incr); #endif pincr = atop(incr); atomic_add_long(&swap_reserved, pincr); uip = curproc->p_ucred->cr_ruidinfo; atomic_add_long(&uip->ui_vmsize, pincr); PROC_UNLOCK(curproc); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curproc->p_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { u_long prev, pdecr; struct uidinfo *uip; uip = cred->cr_ruidinfo; KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__, (uintmax_t)decr)); pdecr = atop(decr); prev = atomic_fetchadd_long(&swap_reserved, -pdecr); if (prev < pdecr) panic("swap_reserved < decr"); prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); if (prev < pdecr) printf("negative vmsize for uid = %d\n", uip->ui_uid); #ifdef RACCT if (racct_enable) racct_sub_cred(cred, RACCT_SWAP, decr); #endif } #define SWM_POP 0x01 /* pop out */ static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A", "Swap Fragmentation Info"); static struct sx sw_alloc_sx; /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swwbuf_zone; static uma_zone_t swrbuf_zone; static uma_zone_t swblk_zone; static uma_zone_t swpctrie_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ .pgo_update_writecount = swap_pager_update_writecount, .pgo_release_writecount = swap_pager_release_writecount, }; /* * swap_*() routines are externally accessible. swp_*() routines are * internal. */ static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0, "Maximum size of a swap block in pages"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit); static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages); static daddr_t swp_pager_getswapspace(int *npages, int limit); /* * Metadata functions */ static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); static void swp_pager_init_freerange(daddr_t *start, daddr_t *num) { *start = SWAPBLK_NONE; *num = 0; } static void swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr) { if (*start + *num == addr) { (*num)++; } else { swp_pager_freeswapspace(*start, *num); *start = addr; *num = 1; } } static void * swblk_trie_alloc(struct pctrie *ptree) { return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0))); } static void swblk_trie_free(struct pctrie *ptree, void *node) { uma_zfree(swpctrie_zone, node); } PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); sx_init(&swdev_syscall_lock, "swsysc"); } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array, which has MAXPHYS / PAGE_SIZE entries, and our locally * defined MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER); nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF); swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4); swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2); /* * Initialize our zone, taking the user's requested size or * estimating the number we need based on the number of pages * in the system. */ n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) : vm_cnt.v_page_count / 2; swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); if (swpctrie_zone == NULL) panic("failed to create swap pctrie zone."); swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, NULL, NULL, _Alignof(struct swblk) - 1, UMA_ZONE_VM); if (swblk_zone == NULL) panic("failed to create swap blk zone."); n2 = n; do { if (uma_zone_reserve_kva(swblk_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); /* * Often uma_zone_reserve_kva() cannot reserve exactly the * requested size. Account for the difference when * calculating swap_maxpages. */ n = uma_zone_get_max(swblk_zone); if (n < n2) printf("Swap blk zone entries changed from %lu to %lu.\n", n2, n); swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblk); if (!uma_zone_reserve_kva(swpctrie_zone, n)) printf("Cannot reserve swap pctrie zone, " "reduce kern.maxswzone.\n"); } static vm_object_t swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset) { vm_object_t object; if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } /* * The un_pager.swp.swp_blks trie is initialized by * vm_object_allocate() to ensure the correct order of * visibility to other threads. */ object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset + PAGE_MASK + size)); object->un_pager.swp.writemappings = 0; object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } return (object); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. * * This routine must ensure that no live duplicate is created for * the named object request, which is protected against by * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) { /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = swap_pager_alloc_init(handle, cred, size, offset); if (object != NULL) { TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object, pager_object_list); } } sx_xunlock(&sw_alloc_sx); } else { object = swap_pager_alloc_init(handle, cred, size, offset); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { VM_OBJECT_WUNLOCK(object); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(object); } vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for up to the requested number of pages, and at * least a minimum number of pages. The starting swap block number * (a page index) is returned or SWAPBLK_NONE if the allocation * failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. */ static daddr_t swp_pager_getswapspace(int *io_npages, int limit) { daddr_t blk; struct swdevt *sp; int mpages, npages; blk = SWAPBLK_NONE; mpages = *io_npages; npages = imin(BLIST_MAX_ALLOC, mpages); mtx_lock(&sw_dev_mtx); sp = swdevhd; while (!TAILQ_EMPTY(&swtailq)) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if ((sp->sw_flags & SW_CLOSING) == 0) blk = blist_alloc(sp->sw_blist, &npages, mpages); if (blk != SWAPBLK_NONE) break; sp = TAILQ_NEXT(sp, sw_list); if (swdevhd == sp) { if (npages <= limit) break; mpages = npages - 1; npages >>= 1; } } if (blk != SWAPBLK_NONE) { *io_npages = npages; blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); } else { if (swap_pager_full != 2) { printf("swp_pager_getswapspace(%d): failed\n", *io_npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; } mtx_unlock(&sw_dev_mtx); return (blk); } static bool swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(bp->b_blkno, sp)) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages) { struct swdevt *sp; if (npages == 0) return; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(blk, sp)) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats */ static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct swdevt *sp; const char *devname; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname); blist_stats(sp->sw_blist, &sbuf); } mtx_unlock(&sw_dev_mtx); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { daddr_t addr, blk, n_free, s_free; int i, j, n; swp_pager_init_freerange(&s_free, &n_free); VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += n) { n = size - i; blk = swp_pager_getswapspace(&n, 1); if (blk == SWAPBLK_NONE) { swp_pager_meta_free(object, start, i); VM_OBJECT_WUNLOCK(object); return (-1); } for (j = 0; j < n; ++j) { addr = swp_pager_meta_build(object, start + i + j, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); } } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WUNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; daddr_t dstaddr, n_free, s_free, srcaddr; VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource && srcobject->handle != NULL) { vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); VM_OBJECT_WUNLOCK(dstobject); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(dstobject); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } /* * Transfer source to destination. */ swp_pager_init_freerange(&s_free, &n_free); for (i = 0; i < dstobject->size; ++i) { srcaddr = swp_pager_meta_ctl(srcobject, i + offset, SWM_POP); if (srcaddr == SWAPBLK_NONE) continue; dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr != SWAPBLK_NONE) { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the source block. */ swp_pager_update_freerange(&s_free, &n_free, srcaddr); continue; } /* * Destination has no swapblk and is not resident, * copy source. * * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); dstaddr = swp_pager_meta_build(dstobject, i, srcaddr); KASSERT(dstaddr == SWAPBLK_NONE, ("Unexpected destination swapblk")); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } swp_pager_freeswapspace(s_free, n_free); /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk, blk0; int i; VM_OBJECT_ASSERT_LOCKED(object); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = i - 1; } /* * find forward-looking contiguous good backing store */ if (after != NULL) { for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = i - 1; } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page must be locked. */ static void swap_pager_unswapped(vm_page_t m) { daddr_t srcaddr; srcaddr = swp_pager_meta_ctl(m->object, m->pindex, SWM_POP); if (srcaddr != SWAPBLK_NONE) swp_pager_freeswapspace(srcaddr, 1); } /* * swap_pager_getpages() - bring pages in from swap * * Attempt to page in the pages in array "ma" of length "count". The * caller may optionally specify that additional pages preceding and * succeeding the specified range be paged in. The number of such pages * is returned in the "rbehind" and "rahead" parameters, and they will * be in the inactive queue upon return. * * The pages in "ma" must be busied and will remain busied upon return. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { struct buf *bp; vm_page_t bm, mpred, msucc, p; vm_pindex_t pindex; daddr_t blk; int i, maxahead, maxbehind, reqcount; reqcount = count; /* * Determine the final number of read-behind pages and * allocate them BEFORE releasing the object lock. Otherwise, * there can be a problematic race with vm_object_split(). * Specifically, vm_object_split() might first transfer pages * that precede ma[0] in the current object to a new object, * and then this function incorrectly recreates those pages as * read-behind pages in the current object. */ if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) return (VM_PAGER_FAIL); /* * Clip the readahead and readbehind ranges to exclude resident pages. */ if (rahead != NULL) { KASSERT(reqcount - 1 <= maxahead, ("page count %d extends beyond swap block", reqcount)); *rahead = imin(*rahead, maxahead - (reqcount - 1)); pindex = ma[reqcount - 1]->pindex; msucc = TAILQ_NEXT(ma[reqcount - 1], listq); if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) *rahead = msucc->pindex - pindex - 1; } if (rbehind != NULL) { *rbehind = imin(*rbehind, maxbehind); pindex = ma[0]->pindex; mpred = TAILQ_PREV(ma[0], pglist, listq); if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) *rbehind = pindex - mpred->pindex - 1; } bm = ma[0]; for (i = 0; i < count; i++) ma[i]->oflags |= VPO_SWAPINPROG; /* * Allocate readahead and readbehind pages. */ if (rbehind != NULL) { for (i = 1; i <= *rbehind; i++) { p = vm_page_alloc(object, ma[0]->pindex - i, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; bm = p; } *rbehind = i - 1; } if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; } *rahead = i; } if (rbehind != NULL) count += *rbehind; if (rahead != NULL) count += *rahead; vm_object_pip_add(object, count); pindex = bm->pindex; blk = swp_pager_meta_ctl(object, pindex, 0); KASSERT(blk != SWAPBLK_NONE, ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); bp = uma_zalloc(swrbuf_zone, M_WAITOK); /* Pages cannot leave the object while busy. */ for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) { MPASS(p->pindex == bm->pindex + i); bp->b_pages[i] = p; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; bp->b_npages = count; bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; bp->b_pgafter = rahead != NULL ? *rahead : 0; VM_CNT_INC(v_swapin); VM_CNT_ADD(v_swappgsin, count); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our ma[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) { ma[0]->oflags |= VPO_SWAPSLEEP; VM_CNT_INC(v_intrans); if (VM_OBJECT_SLEEP(object, &object->handle, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * If we had an unrecoverable read error pages will not be valid. */ for (i = 0; i < reqcount; i++) if (ma[i]->valid != VM_PAGE_BITS_ALL) return (VM_PAGER_ERROR); return (VM_PAGER_OK); /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_getpages_async(): * * Right now this is emulation of asynchronous operation on top of * swap_pager_getpages(). */ static int swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { int r, error; r = swap_pager_getpages(object, ma, count, rbehind, rahead); VM_OBJECT_WUNLOCK(object); switch (r) { case VM_PAGER_OK: error = 0; break; case VM_PAGER_ERROR: error = EIO; break; case VM_PAGER_FAIL: error = EINVAL; break; default: panic("unhandled swap_pager_getpages() error %d", r); } (iodone)(arg, ma, count, error); VM_OBJECT_WLOCK(object); return (r); } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whose rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ static void swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, int flags, int *rtvals) { struct buf *bp; daddr_t addr, blk, n_free, s_free; vm_page_t mreq; int i, j, n; bool async; KASSERT(count == 0 || ma[0]->object == object, ("%s: object mismatch %p/%p", __func__, object, ma[0]->object)); /* * Step 1 * * Turn object into OBJT_SWAP. Force sync if not a pageout process. */ if (object->type != OBJT_SWAP) { addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE); KASSERT(addr == SWAPBLK_NONE, ("unexpected object swap block")); } VM_OBJECT_WUNLOCK(object); async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0; swp_pager_init_freerange(&s_free, &n_free); /* * Step 2 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { /* Maximum I/O size is limited by maximum swap block size. */ n = min(count - i, nsw_cluster_max); /* Get a block of swap of size up to size n. */ blk = swp_pager_getswapspace(&n, 4); if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i + j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied. Build the I/O * request and assign the swap space. */ if (async) { mtx_lock(&swbuf_mtx); while (nsw_wcount_async == 0) msleep(&nsw_wcount_async, &swbuf_mtx, PVM, "swbufa", 0); nsw_wcount_async--; mtx_unlock(&swbuf_mtx); } bp = uma_zalloc(swwbuf_zone, M_WAITOK); if (async) bp->b_flags = B_ASYNC; bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_WLOCK(object); for (j = 0; j < n; ++j) { mreq = ma[i + j]; addr = swp_pager_meta_build(mreq->object, mreq->pindex, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); MPASS(mreq->dirty == VM_PAGE_BITS_ALL); mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_WUNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; VM_CNT_INC(v_swapout); VM_CNT_ADD(v_swappgsout, bp->b_npages); /* * We unconditionally set rtvals[] to VM_PAGER_PEND so that we * can call the async completion routine at the end of a * synchronous I/O operation. Otherwise, our caller would * perform duplicate unbusy and wakeup operations on the page * and object, respectively. */ for (j = 0; j < n; j++) rtvals[i + j] = VM_PAGER_PEND; /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ if (async) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * Report error - unless we ran out of memory, in which case * we've already logged it in swapgeom_strategy(). */ if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); else bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->handle); } if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ MPASS(m->dirty == VM_PAGE_BITS_ALL); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); m->valid = VM_PAGE_BITS_ALL; if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). * A page is only written to swap after a period of * inactivity. Therefore, we do not expect it to be * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); vm_page_lock(m); vm_page_deactivate_noreuse(m); vm_page_unlock(m); vm_page_sunbusy(m); } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ if (bp->b_flags & B_ASYNC) { mtx_lock(&swbuf_mtx); if (++nsw_wcount_async == 1) wakeup(&nsw_wcount_async); mtx_unlock(&swbuf_mtx); } uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp); } int swap_pager_nswapdev(void) { return (nswapdev); } static void swp_pager_force_dirty(vm_page_t m) { vm_page_dirty(m); +#ifdef INVARIANTS + vm_page_lock(m); + if (!vm_page_wired(m) && m->queue == PQ_NONE) + panic("page %p is neither wired nor queued", m); + vm_page_unlock(m); +#endif vm_page_xunbusy(m); swap_pager_unswapped(m); } static void swp_pager_force_launder(vm_page_t m) { vm_page_dirty(m); vm_page_lock(m); vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); swap_pager_unswapped(m); } /* * SWP_PAGER_FORCE_PAGEIN() - force swap blocks to be paged in * * This routine dissociates pages starting at the given index within an * object from their backing store, paging them in if they do not reside * in memory. Pages that are paged in are marked dirty and placed in the * laundry queue. Pages are marked dirty because they no longer have * backing store. They are placed in the laundry queue because they have * not been accessed recently. Otherwise, they would already reside in * memory. */ static void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex, int npages) { vm_page_t ma[npages]; int i, j; KASSERT(npages > 0, ("%s: No pages", __func__)); KASSERT(npages <= MAXPHYS / PAGE_SIZE, ("%s: Too many pages: %d", __func__, npages)); vm_object_pip_add(object, npages); vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL, ma, npages); for (i = j = 0;; i++) { /* Count nonresident pages, to page-in all at once. */ if (i < npages && ma[i]->valid != VM_PAGE_BITS_ALL) continue; if (j < i) { /* Page-in nonresident pages. Mark for laundering. */ if (swap_pager_getpages(object, &ma[j], i - j, NULL, NULL) != VM_PAGER_OK) panic("%s: read from swap failed", __func__); do { swp_pager_force_launder(ma[j]); } while (++j < i); } if (i == npages) break; /* Mark dirty a resident page. */ swp_pager_force_dirty(ma[j++]); } vm_object_pip_wakeupn(object, npages); } /* * swap_pager_swapoff_object: * * Page in all of the pages that have been paged out for an object * to a swap device. */ static void swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object) { struct swblk *sb; vm_pindex_t pi, s_pindex; daddr_t blk, n_blks, s_blk; int i; n_blks = 0; for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi)) != NULL; ) { for (i = 0; i < SWAP_META_PAGES; i++) { blk = sb->d[i]; if (!swp_pager_isondev(blk, sp)) blk = SWAPBLK_NONE; /* * If there are no blocks/pages accumulated, start a new * accumulation here. */ if (n_blks == 0) { if (blk != SWAPBLK_NONE) { s_blk = blk; s_pindex = sb->p + i; n_blks = 1; } continue; } /* * If the accumulation can be extended without breaking * the sequence of consecutive blocks and pages that * swp_pager_force_pagein() depends on, do so. */ if (n_blks < MAXPHYS / PAGE_SIZE && s_blk + n_blks == blk && s_pindex + n_blks == sb->p + i) { ++n_blks; continue; } /* * The sequence of consecutive blocks and pages cannot * be extended, so page them all in here. Then, * because doing so involves releasing and reacquiring * a lock that protects the swap block pctrie, do not * rely on the current swap block. Break this loop and * re-fetch the same pindex from the pctrie again. */ swp_pager_force_pagein(object, s_pindex, n_blks); n_blks = 0; break; } if (i == SWAP_META_PAGES) pi = sb->p + SWAP_META_PAGES; } if (n_blks > 0) swp_pager_force_pagein(object, s_pindex, n_blks); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { vm_object_t object; int retries; sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; full_rescan: mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->type != OBJT_SWAP) continue; mtx_unlock(&vm_object_list_mtx); /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); /* * Dead objects are eventually terminated on their own. */ if ((object->flags & OBJ_DEAD) != 0) goto next_obj; /* * Sync with fences placed after pctrie * initialization. We must not access pctrie below * unless we checked that our object is swap and not * dead. */ atomic_thread_fence_acq(); if (object->type != OBJT_SWAP) goto next_obj; swap_pager_swapoff_object(sp, object); next_obj: VM_OBJECT_WUNLOCK(object); mtx_lock(&vm_object_list_mtx); } mtx_unlock(&vm_object_list_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } EVENTHANDLER_INVOKE(swapoff, sp); } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free? */ static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit) { int i; MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES); for (i = start; i < limit; i++) { if (sb->d[i] != SWAPBLK_NONE) return (false); } return (true); } /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is returned. */ static daddr_t swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted; struct swblk *sb, *sb1; vm_pindex_t modpi, rdpi; daddr_t prev_swapblk; int error, i; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff()'s iteration over * object_list does not see a garbage pctrie. */ atomic_thread_fence_rel(); object->type = OBJT_SWAP; object->un_pager.swp.writemappings = 0; KASSERT(object->handle == NULL, ("default pager with handle")); } rdpi = rounddown(pindex, SWAP_META_PAGES); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb == NULL) { if (swapblk == SWAPBLK_NONE) return (SWAPBLK_NONE); for (;;) { sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (sb != NULL) { sb->p = rdpi; for (i = 0; i < SWAP_META_PAGES; i++) sb->d[i] = SWAPBLK_NONE; if (atomic_cmpset_int(&swblk_zone_exhausted, 1, 0)) printf("swblk zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swblk_zone)) { if (atomic_cmpset_int(&swblk_zone_exhausted, 0, 1)) printf("swap blk zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxb", 10); } else uma_zwait(swblk_zone); VM_OBJECT_WLOCK(object); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb != NULL) /* * Somebody swapped out a nearby page, * allocating swblk at the rdpi index, * while we dropped the object lock. */ goto allocated; } for (;;) { error = SWAP_PCTRIE_INSERT( &object->un_pager.swp.swp_blks, sb); if (error == 0) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 1, 0)) printf("swpctrie zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swpctrie_zone)) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 0, 1)) printf("swap pctrie zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxp", 10); } else uma_zwait(swpctrie_zone); VM_OBJECT_WLOCK(object); sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb1 != NULL) { uma_zfree(swblk_zone, sb); sb = sb1; goto allocated; } } } allocated: MPASS(sb->p == rdpi); modpi = pindex % SWAP_META_PAGES; /* Return prior contents of metadata. */ prev_swapblk = sb->d[modpi]; /* Enter block into metadata. */ sb->d[modpi] = swapblk; /* * Free the swblk if we end up with the empty page run. */ if (swapblk == SWAPBLK_NONE && swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, rdpi); uma_zfree(swblk_zone, sb); } return (prev_swapblk); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t last; int i, limit, start; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP || count == 0) return; swp_pager_init_freerange(&s_free, &n_free); last = pindex + count; for (;;) { sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL || sb->p >= last) break; start = pindex > sb->p ? pindex - sb->p : 0; limit = last - sb->p < SWAP_META_PAGES ? last - sb->p : SWAP_META_PAGES; for (i = start; i < limit; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); sb->d[i] = SWAPBLK_NONE; } pindex = sb->p + SWAP_META_PAGES; if (swp_pager_swblk_empty(sb, 0, start) && swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. */ static void swp_pager_meta_free_all(vm_object_t object) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t pindex; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return; swp_pager_init_freerange(&s_free, &n_free); for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pindex)) != NULL;) { pindex = sb->p + SWAP_META_PAGES; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_METACTL() - misc control of swap meta data. * * This routine is capable of looking up, or removing swapblk * assignments in the swap meta data. It returns the swapblk being * looked-up, popped, or SWAPBLK_NONE if the block was invalid. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * SWM_POP remove from meta data but do not free it */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblk *sb; daddr_t r1; if ((flags & SWM_POP) != 0) VM_OBJECT_ASSERT_WLOCKED(object); else VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (SWAPBLK_NONE); r1 = sb->d[pindex % SWAP_META_PAGES]; if (r1 == SWAPBLK_NONE) return (SWAPBLK_NONE); if ((flags & SWM_POP) != 0) { sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE; if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); uma_zfree(swblk_zone, sb); } } return (r1); } /* * Returns the least page index which is greater than or equal to the * parameter pindex and for which there is a swap block allocated. * Returns object's size if the object's type is not swap or if there * are no allocated swap blocks for the object after the requested * pindex. */ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; int i; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP) return (object->size); sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); if (sb->p < pindex) { for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, roundup(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); } for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } /* * We get here if a swblk is present in the trie but it * doesn't map any blocks. */ MPASS(0); return (object->size); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swblk_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: sx_xunlock(&swdev_syscall_lock); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message. */ static void swapon_check_swzone(void) { unsigned long maxpages, npages; npages = swap_total; /* absolute maximum we can handle assuming 100% efficiency */ maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES; /* recommend using no more than half that amount */ if (npages > maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", npages, maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); } } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf( "WARNING: reducing swap size to maximum of %luMB per unit\n", mblocks / 1024 / 1024 * PAGE_SIZE); nblks = mblocks; } sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* * Do not free the first blocks in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, howmany(BBSIZE, PAGE_SIZE), nblks - howmany(BBSIZE, PAGE_SIZE)); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks - howmany(BBSIZE, PAGE_SIZE); swap_total += nblks; swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); EVENTHANDLER_INVOKE(swapon, sp); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: sx_xunlock(&swdev_syscall_lock); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks; #ifdef MAC int error; #endif sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); swap_total -= nblks; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); mtx_lock(&sw_dev_mtx); sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); sx_xunlock(&swdev_syscall_lock); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { if (vn_isdisk(sp->sw_vp, NULL)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } #if defined(COMPAT_FREEBSD11) #define XSWDEV_VERSION_11 1 struct xswdev11 { u_int xsw_version; uint32_t xsw_dev; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 { u_int xsw_version; u_int xsw_dev1, xsw_dev2; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 xs32; #endif #if defined(COMPAT_FREEBSD11) struct xswdev11 xs11; #endif int error; if (arg2 != 1) /* name length */ return (EINVAL); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_FREEBSD32) if (req->oldlen == sizeof(xs32)) { xs32.xsw_version = XSWDEV_VERSION; xs32.xsw_dev1 = xs.xsw_dev; xs32.xsw_dev2 = xs.xsw_dev >> 32; xs32.xsw_flags = xs.xsw_flags; xs32.xsw_nblks = xs.xsw_nblks; xs32.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs32, sizeof(xs32)); return (error); } #endif #if defined(COMPAT_FREEBSD11) if (req->oldlen == sizeof(xs11)) { xs11.xsw_version = XSWDEV_VERSION_11; xs11.xsw_dev = xs.xsw_dev; /* truncation */ xs11.xsw_flags = xs.xsw_flags; xs11.xsw_nblks = xs.xsw_nblks; xs11.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs11, sizeof(xs11)); return (error); } #endif error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_vm_swap_info, "Swap statistics by device"); /* * Count the approximate swap usage in pages for a vmspace. The * shadowed or not yet copied on write swap blocks are not accounted. * The map must be locked. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; struct swblk *sb; vm_pindex_t e, pi; long count; int i; map = &vmspace->vm_map; count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; object = cur->object.vm_object; if (object == NULL || object->type != OBJT_SWAP) continue; VM_OBJECT_RLOCK(object); if (object->type != OBJT_SWAP) goto unlock; pi = OFF_TO_IDX(cur->offset); e = pi + OFF_TO_IDX(cur->end - cur->start); for (;; pi = sb->p + SWAP_META_PAGES) { sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi); if (sb == NULL || sb->p >= e) break; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->p + i < e && sb->d[i] != SWAPBLK_NONE) count++; } } unlock: VM_OBJECT_RUNLOCK(object); } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } /* * Add a reference to the g_consumer for an inflight transaction. */ static void swapgeom_acquire(struct g_consumer *cp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index++; } /* * Remove a reference from the g_consumer. Post a close event if all * references go away, since the function might be called from the * biodone context. */ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index--; if (cp->index == 0) { if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) sp->sw_id = NULL; } } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bp->b_caller1 = NULL; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; printf("swap_pager: cannot allocate bio\n"); bufdone(bp); return; } bp->b_caller1 = bio; bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } /* * Drop reference we were created with. Do directly since we're in a * special context where we don't have to queue the call to * swapgeom_close_ev(). */ cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* * swapgeom_close() may be called from the biodone context, * where we cannot perform topology changes. Delegate the * work to the events thread. */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static int swapongeom_locked(struct cdev *dev, struct vnode *vp) { struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; pp = g_dev_getprovider(dev); if (pp == NULL) return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 1; /* Number of active I/Os, plus one for being active. */ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); return (0); } static int swapongeom(struct vnode *vp) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; } else { g_topology_lock(); error = swapongeom_locked(vp->v_rdev, vp); g_topology_unlock(); } VOP_UNLOCK(vp, 0); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp, 0); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) { int error, new, n; new = nsw_wcount_async_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new > nswbuf / 2 || new < 1) return (EINVAL); mtx_lock(&swbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. If the current async count is too low, * we will need to sqeeze our update slowly in. Sleep with a * higher priority than getpbuf() to finish faster. */ n = new - nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; msleep(&nsw_wcount_async, &swbuf_mtx, PSWP, "swpsysctl", 0); } } mtx_unlock(&swbuf_mtx); return (0); } static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_NOSPLIT) != 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_NOSPLIT) != 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 352406) +++ head/sys/vm/vm_fault.c (revision 352407) @@ -1,1833 +1,1839 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #define PFBAK 4 #define PFFOR 4 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT) #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) #define VM_FAULT_DONTNEED_MIN 1048576 struct faultstate { vm_page_t m; vm_object_t object; vm_pindex_t pindex; vm_page_t first_m; vm_object_t first_object; vm_pindex_t first_pindex; vm_map_t map; vm_map_entry_t entry; int map_generation; bool lookup_still_valid; struct vnode *vp; }; static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked); static int vm_pfault_oom_attempts = 3; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN, &vm_pfault_oom_attempts, 0, "Number of page allocation attempts in page fault handler before it " "triggers OOM handling"); static int vm_pfault_oom_wait = 10; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN, &vm_pfault_oom_wait, 0, "Number of seconds to wait for free pages before retrying " "the page fault handler"); static inline void release_page(struct faultstate *fs) { vm_page_xunbusy(fs->m); + vm_page_lock(fs->m); vm_page_deactivate(fs->m); + vm_page_unlock(fs->m); fs->m = NULL; } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = false; } } static void unlock_vp(struct faultstate *fs) { if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } static void unlock_and_deallocate(struct faultstate *fs) { vm_object_pip_wakeup(fs->object); VM_OBJECT_WUNLOCK(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_WLOCK(fs->first_object); vm_page_free(fs->first_m); vm_object_pip_wakeup(fs->first_object); VM_OBJECT_WUNLOCK(fs->first_object); fs->first_m = NULL; } vm_object_deallocate(fs->first_object); unlock_map(fs); unlock_vp(fs); } static void vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot, vm_prot_t fault_type, int fault_flags, bool set_wd) { bool need_dirty; if (((prot & VM_PROT_WRITE) == 0 && (fault_flags & VM_FAULT_DIRTY) == 0) || (m->oflags & VPO_UNMANAGED) != 0) return; VM_OBJECT_ASSERT_LOCKED(m->object); need_dirty = ((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_WIRE) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0; if (set_wd) vm_object_set_writeable_dirty(m->object); else /* * If two callers of vm_fault_dirty() with set_wd == * FALSE, one for the map entry with MAP_ENTRY_NOSYNC * flag set, other with flag clear, race, it is * possible for the no-NOSYNC thread to see m->dirty * != 0 and not clear VPO_NOSYNC. Take vm_page lock * around manipulation of VPO_NOSYNC and * vm_page_dirty() call, to avoid the race and keep * m->oflags consistent. */ vm_page_lock(m); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) { if (m->dirty == 0) { m->oflags |= VPO_NOSYNC; } } else { m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also, since the page is now dirty, we can possibly tell * the pager to release any swap backing the page. Calling * the pager requires a write lock on the object. */ if (need_dirty) vm_page_dirty(m); if (!set_wd) vm_page_unlock(m); else if (need_dirty) vm_pager_page_unswapped(m); } /* * Unlocks fs.first_object and fs.map on success. */ static int vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { vm_page_t m, m_map; #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 vm_page_t m_super; int flags; #endif int psind, rv; MPASS(fs->vp == NULL); m = vm_page_lookup(fs->first_object, fs->first_pindex); /* A busy page can be mapped for read|execute access. */ if (m == NULL || ((prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL) return (KERN_FAILURE); m_map = m; psind = 0; #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv)) && \ VM_NRESERVLEVEL > 0 if ((m->flags & PG_FICTITIOUS) == 0 && (m_super = vm_reserv_to_superpage(m)) != NULL && rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start && roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end && (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) & (pagesizes[m_super->psind] - 1)) && !wired && pmap_ps_enabled(fs->map->pmap)) { flags = PS_ALL_VALID; if ((prot & VM_PROT_WRITE) != 0) { /* * Create a superpage mapping allowing write access * only if none of the constituent pages are busy and * all of them are already dirty (except possibly for * the page that was faulted on). */ flags |= PS_NONE_BUSY; if ((fs->first_object->flags & OBJ_UNMANAGED) == 0) flags |= PS_ALL_DIRTY; } if (vm_page_ps_test(m_super, flags, m)) { m_map = m_super; psind = m_super->psind; vaddr = rounddown2(vaddr, pagesizes[psind]); /* Preset the modified bit for dirty superpages. */ if ((flags & PS_ALL_DIRTY) != 0) fault_type |= VM_PROT_WRITE; } } #endif rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type | PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind); if (rv != KERN_SUCCESS) return (rv); if (m_hold != NULL) { *m_hold = m; vm_page_wire(m); } vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false); if (psind == 0 && !wired) vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true); VM_OBJECT_RUNLOCK(fs->first_object); vm_map_lookup_done(fs->map, fs->entry); curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } static void vm_fault_restore_map_lock(struct faultstate *fs) { VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(REFCOUNT_COUNT(fs->first_object->paging_in_progress) > 0); if (!vm_map_trylock_read(fs->map)) { VM_OBJECT_WUNLOCK(fs->first_object); vm_map_lock_read(fs->map); VM_OBJECT_WLOCK(fs->first_object); } fs->lookup_still_valid = true; } static void vm_fault_populate_check_page(vm_page_t m) { /* * Check each page to ensure that the pager is obeying the * interface: the page must be installed in the object, fully * valid, and exclusively busied. */ MPASS(m != NULL); MPASS(m->valid == VM_PAGE_BITS_ALL); MPASS(vm_page_xbusied(m)); } static void vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, vm_pindex_t last) { vm_page_t m; vm_pindex_t pidx; VM_OBJECT_ASSERT_WLOCKED(object); MPASS(first <= last); for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); + vm_page_lock(m); vm_page_deactivate(m); + vm_page_unlock(m); vm_page_xunbusy(m); } } static int vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { struct mtx *m_mtx; vm_offset_t vaddr; vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; int i, npages, psind, rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(REFCOUNT_COUNT(fs->first_object->paging_in_progress) > 0); MPASS(fs->first_object->backing_object == NULL); MPASS(fs->lookup_still_valid); pager_first = OFF_TO_IDX(fs->entry->offset); pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1; unlock_map(fs); unlock_vp(fs); /* * Call the pager (driver) populate() method. * * There is no guarantee that the method will be called again * if the current fault is for read, and a future fault is * for write. Report the entry's maximum allowed protection * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, fault_type, fs->entry->max_protection, &pager_first, &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { /* * VM_PAGER_BAD is the backdoor for a pager to request * normal fault handling. */ vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ return (KERN_NOT_RECEIVER); } if (rv != VM_PAGER_OK) return (KERN_FAILURE); /* AKA SIGSEGV */ /* Ensure that the driver is obeying the interface. */ MPASS(pager_first <= pager_last); MPASS(fs->first_pindex <= pager_last); MPASS(fs->first_pindex >= pager_first); MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) { vm_fault_populate_cleanup(fs->first_object, pager_first, pager_last); return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ } /* * The map is unchanged after our last unlock. Process the fault. * * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. * In case the pager expanded the range, clip it to fit into * the map entry. */ map_first = OFF_TO_IDX(fs->entry->offset); if (map_first > pager_first) { vm_fault_populate_cleanup(fs->first_object, pager_first, map_first - 1); pager_first = map_first; } map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1; if (map_last < pager_last) { vm_fault_populate_cleanup(fs->first_object, map_last + 1, pager_last); pager_last = map_last; } for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx); pidx <= pager_last; pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last || !pmap_ps_enabled(fs->map->pmap) || wired)) psind = 0; #else psind = 0; #endif npages = atop(pagesizes[psind]); for (i = 0; i < npages; i++) { vm_fault_populate_check_page(&m[i]); vm_fault_dirty(fs->entry, &m[i], prot, fault_type, fault_flags, true); } VM_OBJECT_WUNLOCK(fs->first_object); rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), psind); #if defined(__amd64__) if (psind > 0 && rv == KERN_FAILURE) { for (i = 0; i < npages; i++) { rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i), &m[i], prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); MPASS(rv == KERN_SUCCESS); } } #else MPASS(rv == KERN_SUCCESS); #endif VM_OBJECT_WLOCK(fs->first_object); m_mtx = NULL; for (i = 0; i < npages; i++) { if ((fault_flags & VM_FAULT_WIRE) != 0) { vm_page_wire(&m[i]); } else { vm_page_change_lock(&m[i], &m_mtx); vm_page_activate(&m[i]); } if (m_hold != NULL && m[i].pindex == fs->first_pindex) { *m_hold = &m[i]; vm_page_wire(&m[i]); } vm_page_xunbusy(&m[i]); } if (m_mtx != NULL) mtx_unlock(m_mtx); } curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } /* * vm_fault: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags) { struct thread *td; int result; td = curthread; if ((td->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULT)) ktrfault(vaddr, fault_type); #endif result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags, NULL); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND)) ktrfaultend(result); #endif return (result); } int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { struct faultstate fs; struct vnode *vp; struct domainset *dset; vm_object_t next_object, retry_object; vm_offset_t e_end, e_start; vm_pindex_t retry_pindex; vm_prot_t prot, retry_prot; int ahead, alloc_req, behind, cluster_offset, error, era, faultcount; int locked, nera, oom, result, rv; u_char behavior; boolean_t wired; /* Passed by reference. */ bool dead, hardfault, is_first_object_locked; VM_CNT_INC(v_vm_faults); fs.vp = NULL; faultcount = 0; nera = -1; hardfault = false; RetryFault: oom = 0; RetryFault_oom: /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type | VM_PROT_FAULT_LOOKUP, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { unlock_vp(&fs); return (result); } fs.map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("%s: fault on nofault entry, addr: %#lx", __func__, (u_long)vaddr); } if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION && fs.entry->wiring_thread != curthread) { vm_map_unlock_read(fs.map); vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { unlock_vp(&fs); fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else vm_map_unlock(fs.map); goto RetryFault; } MPASS((fs.entry->eflags & MAP_ENTRY_GUARD) == 0); if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); else KASSERT((fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); /* * Try to avoid lock contention on the top-level object through * special-case handling of some types of page faults, specifically, * those that are both (1) mapping an existing page from the top- * level object and (2) not having to mark that object as containing * dirty pages. Under these conditions, a read lock on the top-level * object suffices, allowing multiple page faults of a similar type to * run in parallel on the same top-level object. */ if (fs.vp == NULL /* avoid locked vnode leak */ && (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0 && /* avoid calling vm_object_set_writeable_dirty() */ ((prot & VM_PROT_WRITE) == 0 || (fs.first_object->type != OBJT_VNODE && (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) || (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) { VM_OBJECT_RLOCK(fs.first_object); if ((prot & VM_PROT_WRITE) == 0 || (fs.first_object->type != OBJT_VNODE && (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) || (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) { rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type, fault_flags, wired, m_hold); if (rv == KERN_SUCCESS) return (rv); } if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); } } else { VM_OBJECT_WLOCK(fs.first_object); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = true; fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is marked for imminent termination, * we retry here, since the collapse pass has raced * with us. Otherwise, if we see terminally dead * object, return fail. */ if ((fs.object->flags & OBJ_DEAD) != 0) { dead = fs.object->type == OBJT_DEAD; unlock_and_deallocate(&fs); if (dead) return (KERN_PROTECTION_FAILURE); pause("vmf_de", 1); goto RetryFault; } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * Wait/Retry if the page is busy. We have to do this * if the page is either exclusive or shared busy * because the vm_pager may be using read busy for * pageouts (and even pageins if it is the vnode * pager), and we could end up trying to pagein and * pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a shared busied page except, perhaps, * to pmap it. */ if (vm_page_busied(fs.m)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_free(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); VM_CNT_INC(v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_xbusy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; /* break to PAGE HAS BEEN FOUND */ } KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m)); /* * Page is not resident. If the pager might contain the page * or this is the beginning of the search, allocate a new * page. (Default objects are zero-fill, so there is no real * pager for them.) */ if (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } if (fs.object == fs.first_object && (fs.first_object->flags & OBJ_POPULATE) != 0 && fs.first_object->shadow_count == 0) { rv = vm_fault_populate(&fs, prot, fault_type, fault_flags, wired, m_hold); switch (rv) { case KERN_SUCCESS: case KERN_FAILURE: unlock_and_deallocate(&fs); return (rv); case KERN_RESOURCE_SHORTAGE: unlock_and_deallocate(&fs); goto RetryFault; case KERN_NOT_RECEIVER: /* * Pager's populate() method * returned VM_PAGER_BAD. */ break; default: panic("inconsistent return codes"); } } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ dset = fs.object->domain.dr_policy; if (dset == NULL) dset = curthread->td_domain.dr_policy; if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 vm_object_color(fs.object, atop(vaddr) - fs.pindex); #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); if (vm_pfault_oom_attempts < 0 || oom < vm_pfault_oom_attempts) { oom++; vm_waitpfault(dset, vm_pfault_oom_wait * hz); goto RetryFault_oom; } if (bootverbose) printf( "proc %d (%s) failed to alloc page on fault, starting OOM\n", curproc->p_pid, curproc->p_comm); vm_pageout_oom(VM_OOM_MEM_PF); goto RetryFault; } } readrest: /* * At this point, we have either allocated a new page or found * an existing page that is only partially valid. * * We hold a reference on the current object and the page is * exclusive busied. */ /* * If the pager for the current object might have the page, * then determine the number of additional pages to read and * potentially reprioritize previously read pages for earlier * reclamation. These operations should only be performed * once per page fault. Even if the current pager doesn't * have the page, the number of additional pages to read will * apply to subsequent objects in the shadow chain. */ if (fs.object->type != OBJT_DEFAULT && nera == -1 && !P_KILLED(curproc)) { KASSERT(fs.lookup_still_valid, ("map unlocked")); era = fs.entry->read_ahead; behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM) { nera = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { nera = VM_FAULT_READ_AHEAD_MAX; if (vaddr == fs.entry->next_read) vm_fault_dontneed(&fs, vaddr, nera); } else if (vaddr == fs.entry->next_read) { /* * This is a sequential fault. Arithmetically * increase the requested number of pages in * the read-ahead window. The requested * number of pages is "# of sequential faults * x (read ahead min + 1) + read ahead min" */ nera = VM_FAULT_READ_AHEAD_MIN; if (era > 0) { nera += era + 1; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; } if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_dontneed(&fs, vaddr, nera); } else { /* * This is a non-sequential fault. */ nera = 0; } if (era != nera) { /* * A read lock on the map suffices to update * the read ahead count safely. */ fs.entry->read_ahead = nera; } /* * Prepare for unlocking the map. Save the map * entry's start and end addresses, which are used to * optimize the size of the pager operation below. * Even if the map entry's addresses change after * unlocking the map, using the saved addresses is * safe. */ e_start = fs.entry->start; e_end = fs.entry->end; } /* * Call the pager to retrieve the page if there is a chance * that the pager has it, and potentially retrieve additional * pages at the same time. */ if (fs.object->type != OBJT_DEFAULT) { /* * Release the map lock before locking the vnode or * sleeping in the pager. (If the current object has * a shadow, then an earlier iteration of this loop * may have already unlocked the map.) */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE && (vp = fs.object->handle) != fs.vp) { /* * Perform an unlock in case the desired vnode * changed while the map was unlocked during a * retry. */ unlock_vp(&fs); locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* * We must not sleep acquiring the vnode lock * while we have the page exclusive busied or * the object's paging-in-progress count * incremented. Otherwise, we could deadlock. */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * Page in the requested page and hint the pager, * that it may bring up surrounding pages. */ if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else { /* Is this a sequential fault? */ if (nera > 0) { behind = 0; ahead = nera; } else { /* * Request a cluster of pages that is * aligned to a VM_FAULT_READ_DEFAULT * page offset boundary within the * object. Alignment to a page offset * boundary is more likely to coincide * with the underlying file system * block than alignment to a virtual * address boundary. */ cluster_offset = fs.pindex % VM_FAULT_READ_DEFAULT; behind = ulmin(cluster_offset, atop(vaddr - e_start)); ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset; } ahead = ulmin(ahead, atop(e_end - vaddr) - 1); } rv = vm_pager_get_pages(fs.object, &fs.m, 1, &behind, &ahead); if (rv == VM_PAGER_OK) { faultcount = behind + 1 + ahead; hardfault = true; break; /* break to PAGE HAS BEEN FOUND */ } if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * If an I/O error occurred or the requested page was * outside the range of the pager, clean up and return * an error. */ if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { if (!vm_page_wired(fs.m)) vm_page_free(fs.m); else vm_page_xunbusy(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return (rv == VM_PAGER_ERROR ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } /* * The requested page does not exist at this object/ * offset. Remove the invalid page from the object, * waking up anyone waiting for it, and continue on to * the next object. However, if this is the top-level * object, we must leave the busy page in place to * prevent another process from rushing past us, and * inserting the page in that object at the same time * that we are. */ if (fs.object != fs.first_object) { if (!vm_page_wired(fs.m)) vm_page_free(fs.m); else vm_page_xunbusy(fs.m); fs.m = NULL; } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { VM_CNT_INC(v_ozfod); } VM_CNT_INC(v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } vm_page_assert_xbusied(fs.m); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = false; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { (void)vm_page_remove(fs.m); vm_page_replace_checked(fs.m, fs.first_object, fs.first_pindex, fs.first_m); vm_page_free(fs.first_m); vm_page_dirty(fs.m); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(fs.m, fs.first_object, fs.object, OFF_TO_IDX( fs.first_object->backing_object_offset)); #endif /* * Removing the page from the backing object * unbusied it. */ vm_page_xbusy(fs.m); fs.first_m = fs.m; fs.m = NULL; VM_CNT_INC(v_cow_optim); } else { /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_WIRE) == 0) { vm_page_wire(fs.first_m); vm_page_unwire(fs.m, PQ_INACTIVE); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * We only try to prefault read-only mappings to the * neighboring pages when this copy-on-write fault is * a hard fault. In other cases, trying to prefault * is typically wasted effort. */ if (faultcount == 0) faultcount = 1; /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); VM_CNT_INC(v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = true; if (fs.map->timestamp != fs.map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; fault_type &= retry_prot; if (prot == 0) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* Reassert because wired may have changed. */ KASSERT(wired || (fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); } } /* * If the page was filled by a pager, save the virtual address that * should be faulted on next under a sequential access pattern to the * map entry. A read lock on the map suffices to update this address * safely. */ if (hardfault) fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true); vm_page_assert_xbusied(fs.m); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 && wired == 0) vm_fault_prefault(&fs, vaddr, faultcount > 0 ? behind : PFBAK, faultcount > 0 ? ahead : PFFOR, false); VM_OBJECT_WLOCK(fs.object); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if ((fault_flags & VM_FAULT_WIRE) != 0) { vm_page_wire(fs.m); } else { + vm_page_lock(fs.m); vm_page_activate(fs.m); + vm_page_unlock(fs.m); } if (m_hold != NULL) { *m_hold = fs.m; vm_page_wire(fs.m); } vm_page_xunbusy(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { VM_CNT_INC(v_io_faults); curthread->td_ru.ru_majflt++; #ifdef RACCT if (racct_enable && fs.object->type == OBJT_VNODE) { PROC_LOCK(curproc); if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { racct_add_force(curproc, RACCT_WRITEBPS, PAGE_SIZE + behind * PAGE_SIZE); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } else { racct_add_force(curproc, RACCT_READBPS, PAGE_SIZE + ahead * PAGE_SIZE); racct_add_force(curproc, RACCT_READIOPS, 1); } PROC_UNLOCK(curproc); } #endif } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of pages that precede the faulting pindex within * the first object of the shadow chain. Essentially, perform the equivalent * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes * the faulting pindex by the cluster size when the pages read by vm_fault() * cross a cluster-size boundary. The cluster size is the greater of the * smallest superpage size and VM_FAULT_DONTNEED_MIN. * * When "fs->first_object" is a shadow object, the pages in the backing object * that precede the faulting pindex are deactivated by vm_fault(). So, this * function must only be concerned with pages in the first object. */ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { vm_map_entry_t entry; vm_object_t first_object, object; vm_offset_t end, start; vm_page_t m, m_next; vm_pindex_t pend, pstart; vm_size_t size; object = fs->object; VM_OBJECT_ASSERT_WLOCKED(object); first_object = fs->first_object; if (first_object != object) { if (!VM_OBJECT_TRYWLOCK(first_object)) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WLOCK(first_object); VM_OBJECT_WLOCK(object); } } /* Neither fictitious nor unmanaged pages can be reclaimed. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { size = VM_FAULT_DONTNEED_MIN; if (MAXPAGESIZES > 1 && size < pagesizes[1]) size = pagesizes[1]; end = rounddown2(vaddr, size); if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && (entry = fs->entry)->start < end) { if (end - entry->start < size) start = entry->start; else start = end - size; pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); pstart = OFF_TO_IDX(entry->offset) + atop(start - entry->start); m_next = vm_page_find_least(first_object, pstart); pend = OFF_TO_IDX(entry->offset) + atop(end - entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); if (m->valid != VM_PAGE_BITS_ALL || vm_page_busied(m)) continue; /* * Don't clear PGA_REFERENCED, since it would * likely represent a reference by a different * process. * * Typically, at this point, prefetched pages * are still in the inactive queue. Only * pages that triggered page faults are in the * active queue. */ vm_page_lock(m); if (!vm_page_inactive(m)) vm_page_deactivate(m); vm_page_unlock(m); } } } if (first_object != object) VM_OBJECT_WUNLOCK(first_object); } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; entry = fs->entry; if (addra < backward * PAGE_SIZE) { starta = entry->start; } else { starta = addra - backward * PAGE_SIZE; if (starta < entry->start) starta = entry->start; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE); if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; if (!obj_locked) VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); break; } if (m->valid == VM_PAGE_BITS_ALL && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned. */ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); /* * Check for illegal addresses. */ if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. * * If vm_fault_disable_pagefaults() was called, * i.e., TDP_NOFAULTING is set, we must not sleep nor * acquire MD VM locks, which means we must not call * vm_fault_hold(). Some (out of tree) callers mark * too wide a code area with vm_fault_disable_pagefaults() * already, use the VM_PROT_QUICK_NOFAULT flag to request * the proper behaviour explicitly. */ if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && (curthread->td_pflags & TDP_NOFAULTING) != 0) goto error; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault_hold(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) vm_page_unwire(*mp, PQ_INACTIVE); return (-1); } /* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; access = prot = dst_entry->protection; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, atop(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif dst_object->domain = src_object->domain; dst_object->charge = dst_entry->end - dst_entry->start; } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else if ((dst_object->type == OBJT_DEFAULT || dst_object->type == OBJT_SWAP) && dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object doesn't share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in. * Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); vm_wait(dst_object); VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); dst_m->dirty = dst_m->valid = src_m->valid; } else { dst_m = src_m; if (vm_page_sleep_if_busy(dst_m, "fltupg")) goto again; if (dst_m->pindex >= dst_object->size) /* * We are upgrading. Index can occur * out of bounds if the object type is * vnode and the file was truncated. */ break; vm_page_xbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. * * The page can be invalid if the user called * msync(MS_INVALIDATE) or truncated the backing vnode * or shared memory object. In this case, do not * insert it into pmap, but still do the copy so that * all copies of the wired map entry have similar * backing pages. */ if (dst_m->valid == VM_PAGE_BITS_ALL) { pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); } /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { if (src_m != dst_m) { vm_page_unwire(src_m, PQ_INACTIVE); vm_page_wire(dst_m); } else { KASSERT(vm_page_wired(dst_m), ("dst_m %p is not wired", dst_m)); } } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); } Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 352406) +++ head/sys/vm/vm_mmap.c (revision 352407) @@ -1,1647 +1,1647 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif int old_mlock = 0; SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); static int mincore_mapped = 1; SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0, "mincore reports mappings, not residency"); static int imply_prot_max = 0; SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0, "Imply maximum page permissions in mmap() when none are specified"); #ifdef MAP_32BIT #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #endif #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif int sys_sbrk(struct thread *td, struct sbrk_args *uap) { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif int sys_sstk(struct thread *td, struct sstk_args *uap) { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) int ogetpagesize(struct thread *td, struct ogetpagesize_args *uap) { td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int sys_mmap(struct thread *td, struct mmap_args *uap) { return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, uap->flags, uap->fd, uap->pos)); } int kern_mmap_maxprot(struct proc *p, int prot) { if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 || (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0) return (_PROT_ALL); if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) && prot != PROT_NONE) return (prot); return (_PROT_ALL); } int kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags, int fd, off_t pos) { struct vmspace *vms; struct file *fp; struct proc *p; vm_offset_t addr; vm_size_t pageoff, size; vm_prot_t cap_maxprot; int align, error, max_prot; cap_rights_t rights; if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) return (EINVAL); max_prot = PROT_MAX_EXTRACT(prot); prot = PROT_EXTRACT(prot); if (max_prot != 0 && (max_prot & prot) != prot) return (EINVAL); p = td->td_proc; /* * Always honor PROT_MAX if set. If not, default to all * permissions unless we're implying maximum permissions. */ if (max_prot == 0) max_prot = kern_mmap_maxprot(p, prot); vms = p->p_vmspace; fp = NULL; AUDIT_ARG_FD(fd); addr = addr0; /* * Ignore old flags that used to be defined but did not do anything. */ flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); /* * Enforce the constraints. * Mapping of length 0 is only allowed for old binaries. * Anonymous mapping shall specify -1 as filedescriptor and * zero position for new code. Be nice to ancient a.out * binaries and correct pos for anonymous mapping, since old * ld.so sometimes issues anonymous map requests with non-zero * pos. */ if (!SV_CURPROC_FLAG(SV_AOUT)) { if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) || ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) return (EINVAL); } else { if ((flags & MAP_ANON) != 0) pos = 0; } if (flags & MAP_STACK) { if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | MAP_PREFAULT_READ | MAP_GUARD | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) return (EINVAL); if (prot != PROT_NONE && (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) return (EINVAL); if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 || pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0)) return (EINVAL); /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Compute size from len by rounding (on both ends). */ size = len + pageoff; /* low end... */ size = round_page(size); /* hi end */ /* Check for rounding up to zero. */ if (len > size) return (ENOMEM); /* Ensure alignment is at least a page and fits in a pointer. */ align = flags & MAP_ALIGNMENT_MASK; if (align != 0 && align != MAP_ALIGNED_SUPER && (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (addr < vm_map_min(&vms->vm_map) || addr + size > vm_map_max(&vms->vm_map)) return (EINVAL); if (addr + size < addr) return (EINVAL); #ifdef MAP_32BIT if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) return (EINVAL); } else if (flags & MAP_32BIT) { /* * For MAP_32BIT, override the hint if it is too high and * do not bother moving the mapping past the heap (since * the heap is usually above 2GB). */ if (addr + size > MAP_32BIT_MAX_ADDR) addr = 0; #endif } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)); } if (len == 0) { /* * Return success without mapping anything for old * binaries that request a page-aligned mapping of * length 0. For modern binaries, this function * returns an error earlier. */ error = 0; } else if ((flags & MAP_GUARD) != 0) { error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE, VM_PROT_NONE, flags, NULL, pos, FALSE, td); } else if ((flags & MAP_ANON) != 0) { /* * Mapping blank space is trivial. * * This relies on VM_PROT_* matching PROT_*. */ error = vm_mmap_object(&vms->vm_map, &addr, size, prot, max_prot, flags, NULL, pos, FALSE, td); } else { /* * Mapping file, get fp for validation and don't let the * descriptor disappear on us if we block. Check capability * rights, but also return the maximum rights to be combined * with maxprot later. */ cap_rights_init(&rights, CAP_MMAP); if (prot & PROT_READ) cap_rights_set(&rights, CAP_MMAP_R); if ((flags & MAP_SHARED) != 0) { if (prot & PROT_WRITE) cap_rights_set(&rights, CAP_MMAP_W); } if (prot & PROT_EXEC) cap_rights_set(&rights, CAP_MMAP_X); error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); if (error != 0) goto done; if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && p->p_osrel >= P_OSREL_MAP_FSTRICT) { error = EINVAL; goto done; } /* This relies on VM_PROT_* matching PROT_*. */ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, max_prot & cap_maxprot, flags, pos, td); } if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: if (fp) fdrop(fp, td); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) { return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, uap->flags, uap->fd, uap->pos)); } #endif #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(struct thread *td, struct ommap_args *uap) { static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; int flags, prot; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 prot = cvtbsdprot[uap->prot & 0x7]; #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && prot != 0) prot |= PROT_EXEC; #endif flags = 0; if (uap->flags & OMAP_ANON) flags |= MAP_ANON; if (uap->flags & OMAP_COPY) flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) flags |= MAP_SHARED; else flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) flags |= MAP_FIXED; return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags, uap->fd, uap->pos)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; size_t len; int flags; }; #endif int sys_msync(struct thread *td, struct msync_args *uap) { return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags)); } int kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) { vm_offset_t addr; vm_size_t pageoff; vm_map_t map; int rv; addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. */ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (ENOMEM); case KERN_INVALID_ARGUMENT: return (EBUSY); case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif int sys_munmap(struct thread *td, struct munmap_args *uap) { return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); } int kern_munmap(struct thread *td, uintptr_t addr0, size_t size) { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; bool pmc_handled; #endif vm_offset_t addr; vm_size_t pageoff; vm_map_t map; if (size == 0) return (EINVAL); addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... */ map = &td->td_proc->p_vmspace->vm_map; if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) return (EINVAL); vm_map_lock(map); #ifdef HWPMC_HOOKS pmc_handled = false; if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { pmc_handled = true; /* * Inform hwpmc if the address range being unmapped contains * an executable region. */ pkm.pm_address = (uintptr_t) NULL; if (vm_map_lookup_entry(map, addr, &entry)) { for (; entry->start < addr + size; entry = entry->next) { if (vm_map_check_protection(map, entry->start, entry->end, VM_PROT_EXECUTE) == TRUE) { pkm.pm_address = (uintptr_t) addr; pkm.pm_size = (size_t) size; break; } } } } #endif vm_map_delete(map, addr, addr + size); #ifdef HWPMC_HOOKS if (__predict_false(pmc_handled)) { /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); vm_map_unlock_read(map); } else #endif vm_map_unlock(map); /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ return (0); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif int sys_mprotect(struct thread *td, struct mprotect_args *uap) { return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot)); } int kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot) { vm_offset_t addr; vm_size_t pageoff; int vm_error, max_prot; addr = addr0; if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) return (EINVAL); max_prot = PROT_MAX_EXTRACT(prot); prot = PROT_EXTRACT(prot); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { if (((addr + size) & 0xffffffff) < addr) return (EINVAL); } else #endif if (addr + size < addr) return (EINVAL); vm_error = KERN_SUCCESS; if (max_prot != 0) { if ((max_prot & prot) != prot) return (EINVAL); vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, max_prot, TRUE); } if (vm_error == KERN_SUCCESS) vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE); switch (vm_error) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); case KERN_RESOURCE_SHORTAGE: return (ENOMEM); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif int sys_minherit(struct thread *td, struct minherit_args *uap) { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif int sys_madvise(struct thread *td, struct madvise_args *uap) { return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); } int kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) { vm_map_t map; vm_offset_t addr, end, start; int flags; /* * Check for our special case, advising the swap pager we are * "immortal." */ if (behav == MADV_PROTECT) { flags = PPROT_SET; return (kern_procctl(td, P_PID, td->td_proc->p_pid, PROC_SPROTECT, &flags)); } /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; addr = addr0; if (addr < vm_map_min(map) || addr + len > vm_map_max(map)) return (EINVAL); if ((addr + len) < addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page(addr); end = round_page(addr + len); /* * vm_map_madvise() checks for illegal values of behav. */ return (vm_map_madvise(map, start, end, behav)); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif int sys_mincore(struct thread *td, struct mincore_args *uap) { return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); } int kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; vm_object_t object; vm_paddr_t locked_pa; vm_page_t m; vm_pindex_t pindex; int mincoreinfo; unsigned int timestamp; boolean_t locked; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page(addr0); end = addr + (vm_size_t)round_page(len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) { vm_map_unlock_read(map); return (ENOMEM); } /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; current->start < end; current = current->next) { /* * check for contiguity */ if (current->end < end && current->next->start > current->end) { vm_map_unlock_read(map); return (ENOMEM); } /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ object = NULL; locked_pa = 0; retry: m = NULL; mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); if (mincore_mapped) { /* * We only care about this pmap's * mapping of the page, if any. */ if (locked_pa != 0) { vm_page_unlock(PHYS_TO_VM_PAGE( locked_pa)); } } else if (locked_pa != 0) { /* * The page is mapped by this process but not * both accessed and modified. It is also * managed. Acquire the object lock so that * other mappings might be examined. */ m = PHYS_TO_VM_PAGE(locked_pa); if (m->object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = m->object; locked = VM_OBJECT_TRYWLOCK(object); vm_page_unlock(m); if (!locked) { VM_OBJECT_WLOCK(object); vm_page_lock(m); goto retry; } } else vm_page_unlock(m); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("mincore: page %p is mapped but invalid", m)); } else if (mincoreinfo == 0) { /* * The page is not mapped by this process. If * the object implements managed pages, then * determine if the page is resident so that * the mappings might be examined. */ if (current->object.vm_object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = current->object.vm_object; VM_OBJECT_WLOCK(object); } if (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP || object->type == OBJT_VNODE) { pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); if (m != NULL && m->valid == 0) m = NULL; if (m != NULL) mincoreinfo = MINCORE_INCORE; } } if (m != NULL) { /* Examine other mappings to the page. */ if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty != 0) mincoreinfo |= MINCORE_MODIFIED_OTHER; /* * The first test for PGA_REFERENCED is an * optimization. The second test is * required because a concurrent pmap * operation could clear the last reference * and set PGA_REFERENCED before the call to * pmap_is_referenced(). */ - if ((vm_page_aflags(m) & PGA_REFERENCED) != 0 || + if ((m->aflags & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || - (vm_page_aflags(m) & PGA_REFERENCED) != 0) + (m->aflags & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) VM_OBJECT_WUNLOCK(object); /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = atop(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = atop(end - first_addr); while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif int sys_mlock(struct thread *td, struct mlock_args *uap) { return (kern_mlock(td->td_proc, td->td_ucred, __DECONST(uintptr_t, uap->addr), uap->len)); } int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) { vm_offset_t addr, end, last, start; vm_size_t npages, size; vm_map_t map; unsigned long nsize; int error; error = priv_check_cred(cred, PRIV_VM_MLOCK); if (error) return (error); addr = addr0; size = len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_user_wired) return (ENOMEM); map = &proc->p_vmspace->vm_map; PROC_LOCK(proc); nsize = ptoa(npages + pmap_wired_count(map->pmap)); if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); #ifdef RACCT if (racct_enable) { PROC_LOCK(proc); error = racct_set(proc, RACCT_MEMLOCK, nsize); PROC_UNLOCK(proc); if (error != 0) return (ENOMEM); } #endif error = vm_map_wire(map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(proc); racct_set(proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif int sys_mlockall(struct thread *td, struct mlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); /* * If wiring all pages in the process would cause it to exceed * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) return (ENOMEM); } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); PROC_UNLOCK(td->td_proc); if (error != 0) return (ENOMEM); } #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); if (error == KERN_SUCCESS) error = 0; else if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EAGAIN; } #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif int sys_munlockall(struct thread *td, struct munlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. */ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, 0); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif int sys_munlock(struct thread *td, struct munlock_args *uap) { return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); } int kern_munlock(struct thread *td, uintptr_t addr0, size_t size) { vm_offset_t addr, end, last, start; #ifdef RACCT vm_map_t map; #endif int error; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); addr = addr0; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); map = &td->td_proc->p_vmspace->vm_map; racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, boolean_t *writecounted) { struct vattr va; vm_object_t obj; vm_ooffset_t foff; struct ucred *cred; int error, flags; bool writex; cred = td->td_ucred; writex = (*maxprotp & VM_PROT_WRITE) != 0 && (*flagsp & MAP_SHARED) != 0; if ((error = vget(vp, LK_SHARED, td)) != 0) return (error); AUDIT_ARG_VNODE1(vp); foff = *foffp; flags = *flagsp; obj = vp->v_object; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (obj == NULL) { error = EINVAL; goto done; } if (obj->type == OBJT_VNODE && obj->handle != vp) { vput(vp); vp = (struct vnode *)obj->handle; /* * Bypass filesystems obey the mpsafety of the * underlying fs. Tmpfs never bypasses. */ error = vget(vp, LK_SHARED, td); if (error != 0) return (error); } if (writex) { *writecounted = TRUE; vm_pager_update_writecount(obj, 0, objsize); } } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, cred))) goto done; #ifdef MAC /* This relies on VM_PROT_* matching PROT_*. */ error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { if (prot & VM_PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. */ objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; if (obj->type == OBJT_VNODE) { obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred); if (obj == NULL) { error = ENOMEM; goto done; } } else { KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, ("wrong object type")); VM_OBJECT_WLOCK(obj); vm_object_reference_locked(obj); #if VM_NRESERVLEVEL > 0 vm_object_color(obj, 0); #endif VM_OBJECT_WUNLOCK(obj); } *objp = obj; *flagsp = flags; vfs_mark_atime(vp, cred); done: if (error != 0 && *writecounted) { *writecounted = FALSE; vm_pager_update_writecount(obj, objsize, 0); } vput(vp); return (error); } /* * vm_mmap_cdev() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on cdevs. */ int vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, vm_ooffset_t *foff, vm_object_t *objp) { vm_object_t obj; int error, flags; flags = *flagsp; if (dsw->d_flags & D_MMAP_ANON) { *objp = NULL; *foff = 0; *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); } /* * cdevs do not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if (flags & (MAP_PRIVATE|MAP_COPY)) return (EINVAL); /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); if (error != 0) return (error); #endif /* * First, try d_mmap_single(). If that is not implemented * (returns ENODEV), fall back to using the device pager. * Note that d_mmap_single() must return a reference to the * object (it needs to bump the reference count of the object * it returns somehow). * * XXX assumes VM_PROT_* == PROT_* */ error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); if (error != ENODEV) return (error); obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, td->td_ucred); if (obj == NULL) return (EINVAL); *objp = obj; *flagsp = flags; return (0); } /* * vm_mmap() * * Internal version of mmap used by exec, sys5 shared memory, and * various device drivers. Handle is either a vnode pointer, a * character device, or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, objtype_t handle_type, void *handle, vm_ooffset_t foff) { vm_object_t object; struct thread *td = curthread; int error; boolean_t writecounted; if (size == 0) return (EINVAL); size = round_page(size); object = NULL; writecounted = FALSE; /* * Lookup/allocate object. */ switch (handle_type) { case OBJT_DEVICE: { struct cdevsw *dsw; struct cdev *cdev; int ref; cdev = handle; dsw = dev_refthread(cdev, &ref); if (dsw == NULL) return (ENXIO); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, dsw, &foff, &object); dev_relthread(cdev, ref); break; } case OBJT_VNODE: error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, &foff, &object, &writecounted); break; case OBJT_DEFAULT: if (handle == NULL) { error = 0; break; } /* FALLTHROUGH */ default: error = EINVAL; break; } if (error) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0 && object != NULL) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vm_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } return (error); } /* * Internal version of mmap that maps a specific VM object into an * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. */ int vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, boolean_t writecounted, struct thread *td) { boolean_t curmap, fitit; vm_offset_t max_addr; int docow, error, findspace, rv; curmap = map == &td->td_proc->p_vmspace->vm_map; if (curmap) { RACCT_PROC_LOCK(td->td_proc); if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > lim_cur(td, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } error = racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + size); if (error != 0) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); RACCT_PROC_UNLOCK(td->td_proc); return (error); } } RACCT_PROC_UNLOCK(td->td_proc); } /* * We currently can only deal with page aligned file offsets. * The mmap() system call already enforces this by subtracting * the page offset from the file offset, but checking here * catches errors in device drivers (e.g. d_single_mmap() * callbacks) and other internal mapping requests (such as in * exec). */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; } if (flags & MAP_ANON) { if (object != NULL || foff != 0) return (EINVAL); docow = 0; } else if (flags & MAP_PREFAULT_READ) docow = MAP_PREFAULT; else docow = MAP_PREFAULT_PARTIAL; if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; /* Shared memory is also shared with children. */ if (flags & MAP_SHARED) docow |= MAP_INHERIT_SHARE; if (writecounted) docow |= MAP_WRITECOUNT; if (flags & MAP_STACK) { if (object != NULL) return (EINVAL); docow |= MAP_STACK_GROWS_DOWN; } if ((flags & MAP_EXCL) != 0) docow |= MAP_CHECK_EXCL; if ((flags & MAP_GUARD) != 0) docow |= MAP_CREATE_GUARD; if (fitit) { if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) findspace = VMFS_SUPER_SPACE; else if ((flags & MAP_ALIGNMENT_MASK) != 0) findspace = VMFS_ALIGNED_SPACE(flags >> MAP_ALIGNMENT_SHIFT); else findspace = VMFS_OPTIMAL_SPACE; max_addr = 0; #ifdef MAP_32BIT if ((flags & MAP_32BIT) != 0) max_addr = MAP_32BIT_MAX_ADDR; #endif if (curmap) { rv = vm_map_find_min(map, object, foff, addr, size, round_page((vm_offset_t)td->td_proc->p_vmspace-> vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr, findspace, prot, maxprot, docow); } else { rv = vm_map_find(map, object, foff, addr, size, max_addr, findspace, prot, maxprot, docow); } } else { rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot, docow); } if (rv == KERN_SUCCESS) { /* * If the process has requested that all future mappings * be wired, then heed this. */ if ((map->flags & MAP_WIREFUTURE) != 0) { vm_map_lock(map); if ((map->flags & MAP_WIREFUTURE) != 0) (void)vm_map_wire_locked(map, *addr, *addr + size, VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); vm_map_unlock(map); } } return (vm_mmap_to_errno(rv)); } /* * Translate a Mach VM return code to zero on success or the appropriate errno * on failure. */ int vm_mmap_to_errno(int rv) { switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 352406) +++ head/sys/vm/vm_object.c (revision 352407) @@ -1,2597 +1,2597 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static counter_u64_t object_collapses = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, "VM object collapses"); static counter_u64_t object_bypasses = EARLY_COUNTER; SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, "VM object bypasses"); static void counter_startup(void) { object_collapses = counter_u64_alloc(M_WAITOK); object_bypasses = counter_u64_alloc(M_WAITOK); } SYSINIT(object_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(REFCOUNT_COUNT(object->paging_in_progress) == 0, ("object %p paging_in_progress = %d", object, REFCOUNT_COUNT(object->paging_in_progress))); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; vm_radix_init(&object->rtree); refcount_init(&object->paging_in_progress, 0); object->resident_page_count = 0; object->shadow_count = 0; object->flags = OBJ_DEAD; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; if (type == OBJT_SWAP) pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff() iteration over object_list * sees up to date type and pctrie head if it observed * non-dead object. */ atomic_thread_fence_rel(); switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; object->domain.dr_policy = NULL; object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_zinit(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { refcount_acquiren(&object->paging_in_progress, i); } void vm_object_pip_wakeup(vm_object_t object) { refcount_release(&object->paging_in_progress); } void vm_object_pip_wakeupn(vm_object_t object, short i) { refcount_releasen(&object->paging_in_progress, i); } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (REFCOUNT_COUNT(object->paging_in_progress) > 0) { VM_OBJECT_WUNLOCK(object); refcount_wait(&object->paging_in_progress, waitid, PVM); VM_OBJECT_WLOCK(object); } } void vm_object_pip_wait_unlocked(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_UNLOCKED(object); while (REFCOUNT_COUNT(object->paging_in_progress) > 0) refcount_wait(&object->paging_in_progress, waitid, PVM); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vn_printf(vp, "vm_object_vndeallocate "); panic("vm_object_vndeallocate: bad object reference count"); } #endif if (!umtx_shm_vnobj_persistent && object->ref_count == 1) umtx_shm_object_terminated(object); object->ref_count--; /* vrele may need the vnode lock. */ VM_OBJECT_WUNLOCK(object); vrele(vp); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (REFCOUNT_COUNT(robject->paging_in_progress) > 0) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (REFCOUNT_COUNT(object->paging_in_progress) > 0) { VM_OBJECT_WUNLOCK(robject); VM_OBJECT_WUNLOCK(object); refcount_wait( &object->paging_in_progress, "objde2", PVM); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) { vm_object_set_flag(object, OBJ_DEAD); vm_object_terminate(object); } else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_terminate_pages removes any remaining pageable pages * from the object and resets the object to an empty state. */ static void vm_object_terminate_pages(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); KASSERT(p->object == object && (p->ref_count & VPRC_OBJREF) != 0, ("vm_object_terminate_pages: page %p is inconsistent", p)); p->object = NULL; if (vm_page_drop(p, VPRC_OBJREF) == VPRC_OBJREF) { VM_CNT_INC(v_pfree); vm_page_free(p); } } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("terminating non-dead obj %p", object)); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!REFCOUNT_COUNT(object->paging_in_progress), ("vm_object_terminate: pageout in progress")); KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); if ((object->flags & OBJ_PG_DTOR) == 0) vm_object_terminate_pages(object); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0 && ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * Determine whether the given advice can be applied to the object. Advice is * not applied to unmanaged pages since they never belong to page queues, and * since MADV_FREE is destructive, it can apply only to anonymous pages that * have been mapped at most once. */ static bool vm_object_advice_applies(vm_object_t object, int advice) { if ((object->flags & OBJ_UNMANAGED) != 0) return (false); if (advice != MADV_FREE) return (true); return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) && (object->flags & OBJ_ONEMAPPING) != 0); } static void vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, vm_size_t size) { if (advice == MADV_FREE && object->type == OBJT_SWAP) swap_pager_freespace(object, pindex, size); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m, tm; if (object == NULL) return; relookup: VM_OBJECT_WLOCK(object); if (!vm_object_advice_applies(object, advice)) { VM_OBJECT_WUNLOCK(object); return; } for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; /* * If the next page isn't resident in the top-level object, we * need to search the shadow chain. When applying MADV_FREE, we * take care to release any swap space used to store * non-resident pages. */ if (m == NULL || pindex < m->pindex) { /* * Optimize a common case: if the top-level object has * no backing object, we can skip over the non-resident * range in constant time. */ if (object->backing_object == NULL) { tpindex = (m != NULL && m->pindex < end) ? m->pindex : end; vm_object_madvise_freespace(object, advice, pindex, tpindex - pindex); if ((pindex = tpindex) == end) break; goto next_page; } tpindex = pindex; do { vm_object_madvise_freespace(tobject, advice, tpindex, 1); /* * Prepare to search the next object in the * chain. */ backing_object = tobject->backing_object; if (backing_object == NULL) goto next_pindex; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; if (!vm_object_advice_applies(tobject, advice)) goto next_pindex; } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { next_page: tm = m; m = TAILQ_NEXT(m, listq); } /* * If the page is not in a normal state, skip it. */ if (tm->valid != VM_PAGE_BITS_ALL || vm_page_wired(tm)) goto next_pindex; KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); KASSERT((tm->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_busied(tm)) { if (object != tobject) VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(tm, PGA_REFERENCED); } vm_page_busy_sleep(tm, "madvpo", false); goto relookup; } vm_page_lock(tm); vm_page_advise(tm, advice); vm_page_unlock(tm); vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); result->domain = source->domain; LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); new_object->domain = orig_object->domain; source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_sleep_if_busy(m, "spltwt"); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); vm_radix_wait(); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); /* The page is only NULL when rename fails. */ if (p == NULL) { vm_radix_wait(); } else { if (p->object == object) VM_OBJECT_WUNLOCK(backing_object); else VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmocol", false); } VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & (OBJ_DEAD | OBJ_NOSPLIT)) != 0 || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if (REFCOUNT_COUNT(object->paging_in_progress) > 0 || REFCOUNT_COUNT(backing_object->paging_in_progress) > 0) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { vm_object_pip_add(object, 1); vm_object_pip_add(backing_object, 1); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); vm_object_pip_wakeup(object); counter_u64_add(object_collapses, 1); } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); counter_u64_add(object_bypasses, 1); } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ if (vm_page_busied(p)) { vm_page_sleep_if_busy(p, "vmopar"); goto again; } if (vm_page_wired(p)) { wired: if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } continue; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_write(p)) goto wired; if (p->dirty != 0) continue; } if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_all(p)) goto wired; vm_page_free(p); } vm_object_pip_wakeup(object); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx; vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); vm_page_change_lock(p, &mtx); vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { rv = vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL); if (rv != VM_PAGER_OK) break; /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || (prev_object->flags & OBJ_NOSPLIT) != 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if (prev_object->ref_count > 1 && prev_object->size != next_pindex && (prev_object->flags & OBJ_ONEMAPPING) == 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject, t1object; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); again: locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } if (vm_page_xbusied(tm)) { for (tobject = object; locked_depth > 1; locked_depth--) { t1object = tobject->backing_object; VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } vm_page_busy_sleep(tm, "unwbo", true); goto again; } vm_page_unwire(tm, queue); next_page: pindex++; } /* Release the accumulated object locks. */ for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } } /* * Return the vnode for the given object, or NULL if none exists. * For tmpfs objects, the function may return NULL if there is * no vnode allocated at the time of the call. */ struct vnode * vm_object_vnode(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) { vp = object->handle; KASSERT(vp != NULL, ("%s: OBJT_VNODE has no vnode", __func__)); } else if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; KASSERT(vp != NULL, ("%s: OBJT_TMPFS has no vnode", __func__)); } else { vp = NULL; } return (vp); } /* * Return the kvme type of the given object. * If vpp is not NULL, set it to the object's vm_object_vnode() or NULL. */ int vm_object_kvme_type(vm_object_t object, struct vnode **vpp) { VM_OBJECT_ASSERT_LOCKED(object); if (vpp != NULL) *vpp = vm_object_vnode(object); switch (object->type) { case OBJT_DEFAULT: return (KVME_TYPE_DEFAULT); case OBJT_VNODE: return (KVME_TYPE_VNODE); case OBJT_SWAP: if ((object->flags & OBJ_TMPFS_NODE) != 0) return (KVME_TYPE_VNODE); return (KVME_TYPE_SWAP); case OBJT_DEVICE: return (KVME_TYPE_DEVICE); case OBJT_PHYS: return (KVME_TYPE_PHYS); case OBJT_DEAD: return (KVME_TYPE_DEAD); case OBJT_SG: return (KVME_TYPE_SG); case OBJT_MGTDEVICE: return (KVME_TYPE_MGTDEVICE); default: return (KVME_TYPE_UNKNOWN); } } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject *kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK); error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = obj->shadow_count; kvo->kvo_memattr = obj->memattr; kvo->kvo_active = 0; kvo->kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ - if (m->astate.queue == PQ_ACTIVE) + if (m->queue == PQ_ACTIVE) kvo->kvo_active++; - else if (m->astate.queue == PQ_INACTIVE) + else if (m->queue == PQ_INACTIVE) kvo->kvo_inactive++; } kvo->kvo_vn_fileid = 0; kvo->kvo_vn_fsid = 0; kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; kvo->kvo_type = vm_object_kvme_type(obj, &vp); if (vp != NULL) vref(vp); VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo->kvo_vn_fileid = va.va_fileid; kvo->kvo_vn_fsid = va.va_fsid; kvo->kvo_vn_fsid_freebsd11 = va.va_fsid; /* truncate */ } vput(vp); } strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo->kvo_path) + 1; kvo->kvo_structsize = roundup(kvo->kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); free(kvo, M_TEMP); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 352406) +++ head/sys/vm/vm_page.c (revision 352407) @@ -1,4821 +1,4859 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include -#include +#include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int uma_startup_count(int); extern void uma_startup(void *, int); extern int vmem_startup_count(void); struct vm_domain vm_dom[MAXMEMDOM]; DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; struct mtx_padalign __exclusive_cache_line vm_domainset_lock; /* The following fields are protected by the domainset lock. */ domainset_t __exclusive_cache_line vm_min_domains; domainset_t __exclusive_cache_line vm_severe_domains; static int vm_min_waiters; static int vm_severe_waiters; static int vm_pageproc_waiters; -static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD, 0, - "VM page stats"); - -static counter_u64_t pqstate_commit_aborts = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, commit_aborts, CTLFLAG_RD, - &pqstate_commit_aborts, - "Failed page queue state updates"); - -static counter_u64_t queue_ops = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops, CTLFLAG_RD, - &queue_ops, - "Batched queue operations"); - -static counter_u64_t null_queue_ops = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, null_queue_ops, CTLFLAG_RD, - &null_queue_ops, - "Batched queue operations with no effect"); - -static void -counter_startup(void) -{ - - pqstate_commit_aborts = counter_u64_alloc(M_WAITOK); - queue_ops = counter_u64_alloc(M_WAITOK); - null_queue_ops = counter_u64_alloc(M_WAITOK); -} -SYSINIT(page_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL); - /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; static int boot_pages; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); +static void vm_page_dequeue_complete(vm_page_t m); static void vm_page_enqueue(vm_page_t m, uint8_t queue); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); -static void vm_page_mvqueue(vm_page_t m, const uint8_t queue, - const uint16_t nflag); +static void vm_page_mvqueue(vm_page_t m, uint8_t queue); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); -static bool vm_page_release_toq(vm_page_t m, uint8_t queue, bool noreuse); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags); static void vm_page_zone_release(void *arg, void **store, int cnt); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * The cache page zone is initialized later since we need to be able to allocate * pages before UMA is fully initialized. */ static void vm_page_init_cache_zones(void *dummy __unused) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int domain, pool; for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); /* * Don't allow the page caches to take up more than .25% of * memory. */ if (vmd->vmd_page_count / 400 < 256 * mp_ncpus * VM_NFREEPOOL) continue; for (pool = 0; pool < VM_NFREEPOOL; pool++) { pgcache = &vmd->vmd_pgcache[pool]; pgcache->domain = domain; pgcache->pool = pool; pgcache->zone = uma_zcache_create("vm pgcache", sizeof(struct vm_page), NULL, NULL, NULL, NULL, vm_page_zone_import, vm_page_zone_release, pgcache, UMA_ZONE_MAXBUCKET | UMA_ZONE_VM); (void)uma_zone_set_maxcache(pgcache->zone, 0); } } } SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose) { struct vm_domain *vmd; vm_page_t m; int ret; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) return (true); /* page does not exist, no failure */ vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); ret = vm_phys_unfree_page(m); vm_domain_free_unlock(vmd); if (ret != 0) { vm_domain_freecnt_inc(vmd, -1); TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (verbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } return (ret); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; char *next; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; vm_page_blacklist_add(pa, bootverbose); } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Initialize a dummy page for use in scans of the specified paging queue. * In principle, this function only needs to set the flag PG_MARKER. * Nonetheless, it write busies the page as a safety precaution. */ static void vm_page_init_marker(vm_page_t marker, int queue, uint8_t aflags) { bzero(marker, sizeof(*marker)); - marker->busy_lock = VPB_SINGLE_EXCLUSIVER; - marker->astate.flags = aflags; - marker->astate.queue = queue; marker->flags = PG_MARKER; + marker->aflags = aflags; + marker->busy_lock = VPB_SINGLE_EXCLUSIVER; + marker->queue = queue; } static void vm_page_domain_init(int domain) { struct vm_domain *vmd; struct vm_pagequeue *pq; int i; vmd = VM_DOMAIN(domain); bzero(vmd, sizeof(*vmd)); *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; vmd->vmd_domain = domain; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); pq->pq_pdpages = 0; vm_page_init_marker(&vmd->vmd_markers[i], i, 0); } mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF); snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain); /* * inacthead is used to provide FIFO ordering for LRU-bypassing * insertions. */ vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, &vmd->vmd_inacthead, plinks.q); /* * The clock pages are used to implement active queue scanning without * requeues. Scans start at clock[0], which is advanced after the scan * ends. When the two clock hands meet, they are reset and scanning * resumes from the head of the queue. */ vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED); vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[1], plinks.q); } /* * Initialize a physical page in preparation for adding it to the free * lists. */ static void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) { m->object = NULL; m->ref_count = 0; m->busy_lock = VPB_UNBUSIED; - m->flags = 0; + m->flags = m->aflags = 0; m->phys_addr = pa; - m->astate.flags = 0; - m->astate.queue = PQ_NONE; + m->queue = PQ_NONE; m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; m->pool = VM_FREEPOOL_DEFAULT; m->valid = m->dirty = 0; pmap_page_init(m); } #ifndef PMAP_HAS_PAGE_ARRAY static vm_paddr_t vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range) { vm_paddr_t new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ *vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = page_range; return (new_end); } #endif /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { struct vm_phys_seg *seg; vm_page_t m; char *list, *listend; vm_offset_t mapped; vm_paddr_t end, high_avail, low_avail, new_end, page_range, size; vm_paddr_t last_pa, pa; u_long pagecount; int biggestone, i, segind; #ifdef WITNESS int witness_size; #endif #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) long ii; #endif vaddr = round_page(vaddr); vm_phys_early_startup(); biggestone = vm_phys_avail_largest(); end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(i); /* * Allocate memory for use when boot strapping the kernel memory * allocator. Tell UMA how many zones we are going to create * before going fully functional. UMA will add its zones. * * VM startup zones: vmem, vmem_btag, VM OBJECT, RADIX NODE, MAP, * KMAP ENTRY, MAP ENTRY, VMSPACE. */ boot_pages = uma_startup_count(8); #ifndef UMA_MD_SMALL_ALLOC /* vmem_startup() calls uma_prealloc(). */ boot_pages += vmem_startup_count(); /* vm_map_startup() calls uma_prealloc(). */ boot_pages += howmany(MAX_KMAP, UMA_SLAB_SPACE / sizeof(struct vm_map)); /* * Before going fully functional kmem_init() does allocation * from "KMAP ENTRY" and vmem_create() does allocation from "vmem". */ boot_pages += 2; #endif /* * CTFLAG_RDTUN doesn't work during the early boot process, so we must * manually fetch the value. */ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #ifdef WITNESS witness_size = round_page(witness_startup_count()); new_end -= witness_size; mapped = pmap_map(&vaddr, new_end, new_end + witness_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, witness_size); witness_startup((void *)mapped); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ defined(__i386__) || defined(__mips__) || defined(__riscv) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #else (void)last_pa; #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) /* * Include the UMA bootstrap pages, witness pages and vm_page_dump * in a crash dump. When pmap_map() uses the direct map, they are * not automatically included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef PMAP_HAS_PAGE_ARRAY pmap_page_array_startup(size / PAGE_SIZE); biggestone = vm_phys_avail_largest(); end = new_end = phys_avail[biggestone + 1]; #else #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. */ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; new_end = vm_page_array_alloc(&vaddr, end, page_range); #endif #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ new_end = vm_reserv_startup(&vaddr, new_end); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) if (vm_phys_avail_size(i) != 0) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) for (ii = 0; ii < vm_page_array_size; ii++) { m = &vm_page_array[ii]; vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0); m->flags = PG_FICTITIOUS; } #endif vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) vm_page_init_page(m, pa, segind); /* * Add the segment to the free lists only if it is covered by * one of the ranges in phys_avail. Because we've added the * ranges to the vm_phys_segs array, we can assume that each * segment is either entirely contained in one of the ranges, * or doesn't overlap any of them. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { struct vm_domain *vmd; if (seg->start < phys_avail[i] || seg->end > phys_avail[i + 1]) continue; m = seg->first_page; pagecount = (u_long)atop(seg->end - seg->start); vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_enqueue_contig(m, pagecount); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, pagecount); vm_cnt.v_page_count += (u_int)pagecount; vmd = VM_DOMAIN(seg->domain); vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << m->segind; break; } } /* * Remove blacklisted pages from the physical memory allocator. */ TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_acquire: * * Acquire the busy lock as described by VM_ALLOC_* flags. Will loop * and drop the object lock if necessary. */ int vm_page_busy_acquire(vm_page_t m, int allocflags) { vm_object_t obj; u_int x; bool locked; /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; for (;;) { if ((allocflags & VM_ALLOC_SBUSY) == 0) { if (vm_page_tryxbusy(m)) return (TRUE); } else { if (vm_page_trysbusy(m)) return (TRUE); } if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (FALSE); if (obj != NULL) { locked = VM_OBJECT_WOWNED(obj); } else { MPASS(vm_page_wired(m)); locked = FALSE; } sleepq_lock(m); x = m->busy_lock; if (x == VPB_UNBUSIED || ((allocflags & VM_ALLOC_SBUSY) != 0 && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { sleepq_release(m); continue; } if (locked) VM_OBJECT_WUNLOCK(obj); sleepq_add(m, NULL, "vmpba", 0, 0); sleepq_wait(m, PVM); if (locked) VM_OBJECT_WLOCK(obj); MPASS(m->object == obj || m->object == NULL); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (FALSE); } } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; vm_page_assert_xbusied(m); x = m->busy_lock; for (;;) { if (atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_SHARERS_WORD(1))) break; } if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); x = m->busy_lock; for (;;) { if (VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) == 0) break; wakeup(m); break; } } /* * vm_page_busy_sleep: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * If nonshared is true, sleep only if the page is xbusy. * * The object lock must be held on entry and will be released on exit. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { vm_object_t obj; u_int x; obj = m->object; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_LOCKED(obj); sleepq_lock(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { VM_OBJECT_DROP(obj); sleepq_release(m); return; } VM_OBJECT_DROP(obj); sleepq_add(m, NULL, wmesg, 0, 0); sleepq_wait(m, PVM); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; x = m->busy_lock; for (;;) { if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_fcmpset_acq_int(&m->busy_lock, &x, x + VPB_ONE_SHARER)) return (1); } } /* * vm_page_xunbusy_hard: * * Called when unbusy has failed because there is a waiter. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); /* * Wake the waiter. */ atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); wakeup(m); } /* * Avoid releasing and reacquiring the same page lock. */ void vm_page_change_lock(vm_page_t m, struct mtx **mtx) { struct mtx *mtx1; mtx1 = vm_page_lockptr(m); if (*mtx == mtx1) return; if (*mtx != NULL) mtx_unlock(*mtx); *mtx = mtx1; mtx_lock(mtx1); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { for (; count != 0; count--) { vm_page_unwire(*ma, PQ_ACTIVE); ma++; } } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; - m->astate.queue = PQ_NONE; + m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; /* Fictitious pages are unevictable. */ m->ref_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ + vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); + vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the object lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_sleep_if_xbusy: * * Sleep and release the object lock if the page is xbusied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_xbusy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_xbusied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_busy_sleep(m, msg, true); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page. */ m->object = object; m->pindex = pindex; m->ref_count |= VPRC_OBJREF; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; m->ref_count &= ~VPRC_OBJREF; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("vm_page_insert_radixdone: page %p is missing object ref", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_radixdone: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_radixdone: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * Do the work to remove a page from its object. The caller is responsible for * updating the page's fields to reflect this removal. */ static void vm_page_object_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("page %p is missing its object ref", m)); if (vm_page_xbusied(m)) vm_page_xunbusy(m); mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. Returns true if the object's reference * was the last reference to the page, and false otherwise. * * The object must be locked. */ bool vm_page_remove(vm_page_t m) { vm_page_object_remove(m); m->object = NULL; return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF); } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0, ("vm_page_replace: page %p already in object", mnew)); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; atomic_set_int(&mnew->ref_count, VPRC_OBJREF); mold = vm_radix_replace(&object->rtree, mnew); - KASSERT(mold->astate.queue == PQ_NONE, + KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: old page %p is on a paging queue", mold)); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; atomic_clear_int(&mold->ref_count, VPRC_OBJREF); vm_page_xunbusy(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m)); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_object_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { return (vm_page_alloc_after(object, pindex, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } vm_page_t vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req) { return (vm_page_alloc_domain_after(object, pindex, domain, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } /* * Allocate a page in the specified object with the given page index. To * optimize insertion of the page into the object, the caller must also specifiy * the resident page in the object with largest index smaller than the given * page index, or NULL if no such page exists. */ vm_page_t vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t mpred) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } /* * Returns true if the number of free pages exceeds the minimum * for the request class and false otherwise. */ int vm_domain_allocate(struct vm_domain *vmd, int req, int npages) { u_int limit, old, new; req = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req != VM_ALLOC_INTERRUPT) req = VM_ALLOC_SYSTEM; if (req == VM_ALLOC_INTERRUPT) limit = 0; else if (req == VM_ALLOC_SYSTEM) limit = vmd->vmd_interrupt_free_min; else limit = vmd->vmd_free_reserved; /* * Attempt to reserve the pages. Fail if we're below the limit. */ limit += npages; old = vmd->vmd_free_count; do { if (old < limit) return (0); new = old - npages; } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0); /* Wake the page daemon if we've crossed the threshold. */ if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old)) pagedaemon_wakeup(vmd->vmd_domain); /* Only update bitsets on transitions. */ if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) vm_domain_set(vmd); return (1); } vm_page_t vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { struct vm_domain *vmd; vm_page_t m; int flags, pool; KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); KASSERT(mpred == NULL || mpred->pindex < pindex, ("mpred %p doesn't precede pindex 0x%jx", mpred, (uintmax_t)pindex)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); flags = 0; m = NULL; pool = object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the page from a reservation? */ if (vm_object_reserv(object) && (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) != NULL) { domain = vm_phys_domain(m); vmd = VM_DOMAIN(domain); goto found; } #endif vmd = VM_DOMAIN(domain); if (vmd->vmd_pgcache[pool].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[pool].zone, M_NOWAIT); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { /* * If not, allocate it from the free page queues. */ vm_domain_free_lock(vmd); m = vm_phys_alloc_pages(domain, pool, 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_inactive(domain)) goto again; #endif } } if (m == NULL) { /* * Not allocatable, give up. */ if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } /* * At this point we had better have found a good page. */ found: vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ if ((req & VM_ALLOC_ZERO) != 0) flags |= (m->flags & PG_ZERO); if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; - m->astate.flags = 0; + m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ vm_wire_add(1); m->ref_count = 1; } - m->astate.act_count = 0; + m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if (req & VM_ALLOC_WIRED) { vm_wire_sub(1); m->ref_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domain *vmd; vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ m_ret = NULL; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the pages from a reservation? */ if (vm_object_reserv(object) && (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req, mpred, npages, low, high, alignment, boundary)) != NULL) { domain = vm_phys_domain(m_ret); vmd = VM_DOMAIN(domain); goto found; } #endif vmd = VM_DOMAIN(domain); if (vm_domain_allocate(vmd, req, npages)) { /* * allocate them from the free page queues. */ vm_domain_free_lock(vmd); m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); vm_domain_free_unlock(vmd); if (m_ret == NULL) { vm_domain_freecnt_inc(vmd, npages); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(domain, npages, low, high, alignment, boundary)) goto again; #endif } } if (m_ret == NULL) { if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } #if VM_NRESERVLEVEL > 0 found: #endif for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { - m->astate.flags = 0; + m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; - m->astate.act_count = 0; + m->act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if ((req & VM_ALLOC_WIRED) != 0) vm_wire_sub(npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->ref_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); - KASSERT(m->astate.queue == PQ_NONE && - (m->astate.flags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", - m, m->astate.queue, (m->astate.flags & PGA_QUEUE_STATE_MASK))); + m, m->queue, (m->aflags & PGA_QUEUE_STATE_MASK))); KASSERT(m->ref_count == 0, ("page %p has references", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_freelist(int freelist, int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { struct vm_domain *vmd; vm_page_t m; u_int flags; m = NULL; vmd = VM_DOMAIN(domain); again: if (vm_domain_allocate(vmd, req, 1)) { vm_domain_free_lock(vmd); m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); vm_domain_free_unlock(vmd); if (m == NULL) vm_domain_freecnt_inc(vmd, 1); } if (m == NULL) { if (vm_domain_alloc_fail(vmd, NULL, req)) goto again; return (NULL); } vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ - m->astate.flags = 0; + m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ vm_wire_add(1); m->ref_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; return (m); } static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); /* Only import if we can bring in a full bucket. */ if (cnt == 1 || !vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) return (0); domain = vmd->vmd_domain; vm_domain_free_lock(vmd); i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, (vm_page_t *)store); vm_domain_free_unlock(vmd); if (cnt != i) vm_domain_freecnt_inc(vmd, cnt - i); return (i); } static void vm_page_zone_release(void *arg, void **store, int cnt) { struct vm_domain *vmd; struct vm_pgcache *pgcache; vm_page_t m; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); vm_domain_free_lock(vmd); for (i = 0; i < cnt; i++) { m = (vm_page_t)store[i]; vm_phys_free_pages(m, 0); } vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { struct mtx *m_mtx; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; m_mtx = NULL; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & PG_MARKER) == 0, ("page %p is PG_MARKER", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1, ("fictitious page %p has invalid ref count", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); vm_page_change_lock(m, &m_mtx); m_inc = 1; retry: if (vm_page_wired(m)) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = m->object) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ if (!VM_OBJECT_TRYRLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_RLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_RUNLOCK(object); goto retry; } } /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && - !vm_page_busied(m) && !vm_page_wired(m)) { + vm_page_queue(m) != PQ_NONE && !vm_page_busied(m) && + !vm_page_wired(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ run_ext = 1; } else run_ext = 0; VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (m_mtx != NULL) mtx_unlock(m_mtx); if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct vm_domain *vmd; struct mtx *m_mtx; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; m_mtx = NULL; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Avoid releasing and reacquiring the same page lock. */ vm_page_change_lock(m, &m_mtx); retry: /* * Racily check for wirings. Races are handled below. */ if (vm_page_wired(m)) error = EBUSY; else if ((object = m->object) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_WLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_WUNLOCK(object); goto retry; } } /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; - else if (!vm_page_busied(m) && !vm_page_wired(m)) { + else if (vm_page_queue(m) != PQ_NONE && + !vm_page_busied(m) && !vm_page_wired(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ if (m->valid != 0) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { error = ENOMEM; goto unlock; } /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ if (object->ref_count != 0 && !vm_page_try_remove_all(m)) { error = EBUSY; goto unlock; } - m_new->astate.flags = m->astate.flags & + m_new->aflags = m->aflags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); m_new->oflags = m->oflags & VPO_NOSYNC; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_xbusy(m); vm_page_dequeue(m); vm_page_replace_checked(m_new, object, m->pindex, m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); /* * The new page must be deactivated * before the object is unlocked. */ vm_page_change_lock(m_new, &m_mtx); vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { MPASS(vm_phys_domain(m) == domain); vmd = VM_DOMAIN(domain); vm_domain_free_lock(vmd); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif vm_domain_free_unlock(vmd); if (order == VM_NFREEORDER) error = EINVAL; } } if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { int cnt; vmd = VM_DOMAIN(domain); cnt = 0; vm_domain_free_lock(vmd); do { MPASS(vm_phys_domain(m) == domain); SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_free_pages(m, 0); cnt++; } while ((m = SLIST_FIRST(&free)) != NULL); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform vm_wait() before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ vmd = VM_DOMAIN(domain); count = vmd->vmd_free_count; if (count < npages + vmd->vmd_free_reserved || (count < npages + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(domain, npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, domain, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domainset_iter di; int domain; bool ret; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); if (ret) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (ret); } /* * Set the domain in the appropriate page level domainset. */ void vm_domain_set(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (!vmd->vmd_minset && vm_paging_min(vmd)) { vmd->vmd_minset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); } if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { vmd->vmd_severeset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); } mtx_unlock(&vm_domainset_lock); } /* * Clear the domain from the appropriate page level domainset. */ void vm_domain_clear(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_minset && !vm_paging_min(vmd)) { vmd->vmd_minset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); if (vm_min_waiters != 0) { vm_min_waiters = 0; wakeup(&vm_min_domains); } } if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { vmd->vmd_severeset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); if (vm_severe_waiters != 0) { vm_severe_waiters = 0; wakeup(&vm_severe_domains); } } /* * If pageout daemon needs pages, then tell it that there are * some free. */ if (vmd->vmd_pageout_pages_needed && vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { wakeup(&vmd->vmd_pageout_pages_needed); vmd->vmd_pageout_pages_needed = 0; } /* See comments in vm_wait_doms(). */ if (vm_pageproc_waiters) { vm_pageproc_waiters = 0; wakeup(&vm_pageproc_waiters); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the min threshold globally. */ void vm_wait_min(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_min()) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the severe threshold globally. */ void vm_wait_severe(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_severe()) { vm_severe_waiters++; msleep(&vm_severe_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } u_int vm_wait_count(void) { return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); } void vm_wait_doms(const domainset_t *wdoms) { /* * We use racey wakeup synchronization to avoid expensive global * locking for the pageproc when sleeping with a non-specific vm_wait. * To handle this, we only sleep for one tick in this instance. It * is expected that most allocations for the pageproc will come from * kmem or vm_page_grab* which will use the more specific and * race-free vm_wait_domain(). */ if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); vm_pageproc_waiters++; msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP, "pageprocwait", 1); } else { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(wdoms)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP, "vmwait", 0); } else mtx_unlock(&vm_domainset_lock); } } /* * vm_wait_domain: * * Sleep until free pages are available for allocation. * - Called in various places after failed memory allocations. */ void vm_wait_domain(int domain) { struct vm_domain *vmd; domainset_t wdom; vmd = VM_DOMAIN(domain); vm_domain_free_assert_unlocked(vmd); if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { vmd->vmd_pageout_pages_needed = 1; msleep(&vmd->vmd_pageout_pages_needed, &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); } else mtx_unlock(&vm_domainset_lock); } else { if (pageproc == NULL) panic("vm_wait in early boot"); DOMAINSET_ZERO(&wdom); DOMAINSET_SET(vmd->vmd_domain, &wdom); vm_wait_doms(&wdom); } } /* * vm_wait: * * Sleep until free pages are available for allocation in the * affinity domains of the obj. If obj is NULL, the domain set * for the calling thread is used. * Called in various places after failed memory allocations. */ void vm_wait(vm_object_t obj) { struct domainset *d; d = NULL; /* * Carefully fetch pointers only once: the struct domainset * itself is ummutable but the pointer might change. */ if (obj != NULL) d = obj->domain.dr_policy; if (d == NULL) d = curthread->td_domain.dr_policy; vm_wait_doms(&d->ds_mask); } /* * vm_domain_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the * domain_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. * */ static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) { vm_domain_free_assert_unlocked(vmd); atomic_add_int(&vmd->vmd_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_wait_domain(vmd->vmd_domain); if (object != NULL) VM_OBJECT_WLOCK(object); if (req & VM_ALLOC_WAITOK) return (EAGAIN); } return (0); } /* * vm_waitpfault: * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(struct domainset *dset, int timo) { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(&dset->ds_mask)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, "pfault", timo); } else mtx_unlock(&vm_domainset_lock); } -bool -vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new) +static struct vm_pagequeue * +vm_page_pagequeue(vm_page_t m) { - vm_page_t next; - struct vm_pagequeue *pq; - int mask; - if (old->queue != PQ_NONE && old->queue != new.queue) { - new.flags &= ~PGA_ENQUEUED; + uint8_t queue; - pq = _vm_page_pagequeue(m, old->queue); - - /* - * The physical queue state might change at any point before the - * page queue lock is acquired, so we must verify that the lock - * is correct before proceeding. Once the page's queue index is - * changed, the page queue lock we hold will no longer - * synchronize the physical queue state of the page, so we must - * awkwardly remove the page from the queue and put it back if - * the commit fails. - */ - vm_pagequeue_lock(pq); - if (__predict_false(m->astate.queue != old->queue)) { - vm_pagequeue_unlock(pq); - *old = vm_page_astate_load(m); - return (false); - } - if (__predict_true((m->astate.flags & PGA_ENQUEUED) != 0)) { - next = TAILQ_NEXT(m, plinks.q); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - } - if (__predict_false(!vm_page_astate_fcmpset(m, old, new))) { - if ((old->flags & PGA_ENQUEUED) != 0) { - if (next == NULL) - TAILQ_INSERT_TAIL(&pq->pq_pl, m, - plinks.q); - else - TAILQ_INSERT_BEFORE(next, m, plinks.q); - } - vm_pagequeue_unlock(pq); - counter_u64_add(pqstate_commit_aborts, 1); - return (false); - } - if ((old->flags & PGA_ENQUEUED) != 0) - vm_pagequeue_cnt_dec(pq); - vm_pagequeue_unlock(pq); - } else if (__predict_false(!vm_page_astate_fcmpset(m, old, new))) { - counter_u64_add(pqstate_commit_aborts, 1); - return (false); - } - - if (new.queue != PQ_NONE) { - mask = new.flags & PGA_QUEUE_OP_MASK; - if (mask != 0 && (old->flags & mask) != mask) - vm_page_pqbatch_submit(m, new.queue); - } - - return (true); + if ((queue = atomic_load_8(&m->queue)) == PQ_NONE) + return (NULL); + return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); } static inline void -vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue) +vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m) { - vm_page_t next; struct vm_domain *vmd; - vm_page_astate_t old, new; + uint8_t qflags; CRITICAL_ASSERT(curthread); vm_pagequeue_assert_locked(pq); - old = vm_page_astate_load(m); -retry: - if (__predict_false(old.queue != queue)) - return; - KASSERT(pq == _vm_page_pagequeue(m, queue), - ("page %p does not belong to queue %p", m, pq)); - KASSERT(old.queue != PQ_NONE || (old.flags & PGA_QUEUE_STATE_MASK) == 0, - ("page %p has unexpected queue state", m)); - /* - * Update the page's queue state before modifying the page queues - * themselves, to avoid having to roll back updates when a queue state - * update fails and requires a retry. + * The page daemon is allowed to set m->queue = PQ_NONE without + * the page queue lock held. In this case it is about to free the page, + * which must not have any queue state. */ - new = old; - if ((old.flags & PGA_DEQUEUE) != 0) { - new.queue = PQ_NONE; - new.flags &= ~PGA_QUEUE_STATE_MASK; - if (__predict_true((old.flags & PGA_ENQUEUED) != 0)) { - next = TAILQ_NEXT(m, plinks.q); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - } - if (__predict_false(!vm_page_astate_fcmpset(m, &old, new))) { - if ((old.flags & PGA_ENQUEUED) != 0) { - if (next == NULL) - TAILQ_INSERT_TAIL(&pq->pq_pl, m, - plinks.q); - else - TAILQ_INSERT_BEFORE(next, m, plinks.q); - } - counter_u64_add(pqstate_commit_aborts, 1); - goto retry; - } - if ((old.flags & PGA_ENQUEUED) != 0) - vm_pagequeue_cnt_dec(pq); - counter_u64_add(queue_ops, 1); - } else if ((old.flags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { - new.flags |= PGA_ENQUEUED; - new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); - if (__predict_false(!vm_page_astate_fcmpset(m, &old, new))) { - counter_u64_add(pqstate_commit_aborts, 1); - goto retry; - } + qflags = atomic_load_8(&m->aflags); + KASSERT(pq == vm_page_pagequeue(m) || + (qflags & PGA_QUEUE_STATE_MASK) == 0, + ("page %p doesn't belong to queue %p but has aflags %#x", + m, pq, qflags)); - if ((old.flags & PGA_ENQUEUED) != 0) + if ((qflags & PGA_DEQUEUE) != 0) { + if (__predict_true((qflags & PGA_ENQUEUED) != 0)) + vm_pagequeue_remove(pq, m); + vm_page_dequeue_complete(m); + } else if ((qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { + if ((qflags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); - else + else { vm_pagequeue_cnt_inc(pq); + vm_page_aflag_set(m, PGA_ENQUEUED); + } /* - * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. In - * particular, if both flags are set in close succession, only - * PGA_REQUEUE_HEAD will be applied, even if it was set first. + * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. + * In particular, if both flags are set in close succession, + * only PGA_REQUEUE_HEAD will be applied, even if it was set + * first. */ - if ((old.flags & PGA_REQUEUE_HEAD) != 0) { - KASSERT(old.queue == PQ_INACTIVE, + if ((qflags & PGA_REQUEUE_HEAD) != 0) { + KASSERT(m->queue == PQ_INACTIVE, ("head enqueue not supported for page %p", m)); vmd = vm_pagequeue_domain(m); TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); - } else { + } else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); - } - counter_u64_add(queue_ops, 1); - } else { - counter_u64_add(null_queue_ops, 1); + + vm_page_aflag_clear(m, qflags & (PGA_REQUEUE | + PGA_REQUEUE_HEAD)); } } static void vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, uint8_t queue) { + vm_page_t m; int i; - for (i = 0; i < bq->bq_cnt; i++) - vm_pqbatch_process_page(pq, bq->bq_pa[i], queue); + for (i = 0; i < bq->bq_cnt; i++) { + m = bq->bq_pa[i]; + if (__predict_false(m->queue != queue)) + continue; + vm_pqbatch_process_page(pq, m); + } vm_batchqueue_init(bq); } /* * vm_page_pqbatch_submit: [ internal use only ] * * Enqueue a page in the specified page queue's batched work queue. + * The caller must have encoded the requested operation in the page + * structure's aflags field. */ void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue) { struct vm_batchqueue *bq; struct vm_pagequeue *pq; int domain; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("page %p is unmanaged", m)); + KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL, + ("missing synchronization for page %p", m)); KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); domain = vm_phys_domain(m); pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue]; critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); if (vm_batchqueue_insert(bq, m)) { critical_exit(); return; } if (!vm_pagequeue_trylock(pq)) { critical_exit(); vm_pagequeue_lock(pq); critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); } vm_pqbatch_process(pq, bq, queue); - vm_pqbatch_process_page(pq, m, queue); + + /* + * The page may have been logically dequeued before we acquired the + * page queue lock. In this case, since we either hold the page lock + * or the page is being freed, a different thread cannot be concurrently + * enqueuing the page. + */ + if (__predict_true(m->queue == queue)) + vm_pqbatch_process_page(pq, m); + else { + KASSERT(m->queue == PQ_NONE, + ("invalid queue transition for page %p", m)); + KASSERT((m->aflags & PGA_ENQUEUED) == 0, + ("page %p is enqueued with invalid queue index", m)); + } vm_pagequeue_unlock(pq); critical_exit(); } /* * vm_page_pqbatch_drain: [ internal use only ] * * Force all per-CPU page queue batch queues to be drained. This is * intended for use in severe memory shortages, to ensure that pages * do not remain stuck in the batch queues. */ void vm_page_pqbatch_drain(void) { struct thread *td; struct vm_domain *vmd; struct vm_pagequeue *pq; int cpu, domain, queue; td = curthread; CPU_FOREACH(cpu) { thread_lock(td); sched_bind(td, cpu); thread_unlock(td); for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (queue = 0; queue < PQ_COUNT; queue++) { pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); critical_enter(); vm_pqbatch_process(pq, DPCPU_PTR(pqbatch[domain][queue]), queue); critical_exit(); vm_pagequeue_unlock(pq); } } } thread_lock(td); sched_unbind(td); thread_unlock(td); } -/* XXX comment */ +/* + * Complete the logical removal of a page from a page queue. We must be + * careful to synchronize with the page daemon, which may be concurrently + * examining the page with only the page lock held. The page must not be + * in a state where it appears to be logically enqueued. + */ static void -vm_page_dequeue_free(vm_page_t m) +vm_page_dequeue_complete(vm_page_t m) { - vm_page_astate_t old, new; - for (old = vm_page_astate_load(m);;) { - if (old.queue == PQ_NONE) { - KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, - ("page %p has unexpected queue state flags %#x", - m, old.flags)); - break; - } - if ((old.flags & PGA_DEQUEUE) != 0) { - vm_page_pqbatch_submit(m, old.queue); - break; - } - new = old; - new.flags |= PGA_DEQUEUE; - if (vm_page_pqstate_commit(m, &old, new)) - break; - } + m->queue = PQ_NONE; + atomic_thread_fence_rel(); + vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK); } /* + * vm_page_dequeue_deferred: [ internal use only ] + * + * Request removal of the given page from its current page + * queue. Physical removal from the queue may be deferred + * indefinitely. + * + * The page must be locked. + */ +void +vm_page_dequeue_deferred(vm_page_t m) +{ + uint8_t queue; + + vm_page_assert_locked(m); + + if ((queue = vm_page_queue(m)) == PQ_NONE) + return; + + /* + * Set PGA_DEQUEUE if it is not already set to handle a concurrent call + * to vm_page_dequeue_deferred_free(). In particular, avoid modifying + * the page's queue state once vm_page_dequeue_deferred_free() has been + * called. In the event of a race, two batch queue entries for the page + * will be created, but the second will have no effect. + */ + if (vm_page_pqstate_cmpset(m, queue, queue, PGA_DEQUEUE, PGA_DEQUEUE)) + vm_page_pqbatch_submit(m, queue); +} + +/* + * A variant of vm_page_dequeue_deferred() that does not assert the page + * lock and is only to be called from vm_page_free_prep(). Because the + * page is being freed, we can assume that nothing other than the page + * daemon is scheduling queue operations on this page, so we get for + * free the mutual exclusion that is otherwise provided by the page lock. + * To handle races, the page daemon must take care to atomically check + * for PGA_DEQUEUE when updating queue state. + */ +static void +vm_page_dequeue_deferred_free(vm_page_t m) +{ + uint8_t queue; + + KASSERT(m->ref_count == 0, ("page %p has references", m)); + + if ((m->aflags & PGA_DEQUEUE) != 0) + return; + atomic_thread_fence_acq(); + if ((queue = m->queue) == PQ_NONE) + return; + vm_page_aflag_set(m, PGA_DEQUEUE); + vm_page_pqbatch_submit(m, queue); +} + +/* * vm_page_dequeue: * * Remove the page from whichever page queue it's in, if any. - * XXX + * The page must either be locked or unallocated. This constraint + * ensures that the queue state of the page will remain consistent + * after this function returns. */ void vm_page_dequeue(vm_page_t m) { - vm_page_astate_t old, new; + struct vm_pagequeue *pq, *pq1; + uint8_t aflags; - for (old = vm_page_astate_load(m);;) { - if (old.queue == PQ_NONE) { - KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL, + ("page %p is allocated and unlocked", m)); + + for (pq = vm_page_pagequeue(m);; pq = pq1) { + if (pq == NULL) { + /* + * A thread may be concurrently executing + * vm_page_dequeue_complete(). Ensure that all queue + * state is cleared before we return. + */ + aflags = atomic_load_8(&m->aflags); + if ((aflags & PGA_QUEUE_STATE_MASK) == 0) + return; + KASSERT((aflags & PGA_DEQUEUE) != 0, ("page %p has unexpected queue state flags %#x", - m, old.flags)); - break; + m, aflags)); + + /* + * Busy wait until the thread updating queue state is + * finished. Such a thread must be executing in a + * critical section. + */ + cpu_spinwait(); + pq1 = vm_page_pagequeue(m); + continue; } - new = old; - new.queue = PQ_NONE; - new.flags &= ~PGA_QUEUE_STATE_MASK; - if (vm_page_pqstate_commit(m, &old, new)) + vm_pagequeue_lock(pq); + if ((pq1 = vm_page_pagequeue(m)) == pq) break; + vm_pagequeue_unlock(pq); } + KASSERT(pq == vm_page_pagequeue(m), + ("%s: page %p migrated directly between queues", __func__, m)); + KASSERT((m->aflags & PGA_DEQUEUE) != 0 || + mtx_owned(vm_page_lockptr(m)), + ("%s: queued unlocked page %p", __func__, m)); + + if ((m->aflags & PGA_ENQUEUED) != 0) + vm_pagequeue_remove(pq, m); + vm_page_dequeue_complete(m); + vm_pagequeue_unlock(pq); } /* * Schedule the given page for insertion into the specified page queue. * Physical insertion of the page may be deferred indefinitely. */ static void vm_page_enqueue(vm_page_t m, uint8_t queue) { vm_page_assert_locked(m); - KASSERT(m->astate.queue == PQ_NONE && - (m->astate.flags & PGA_QUEUE_STATE_MASK) == 0, + KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p is already enqueued", __func__, m)); - m->astate.queue = queue; - if ((m->astate.flags & PGA_REQUEUE) == 0) + m->queue = queue; + if ((m->aflags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_page_pqbatch_submit(m, queue); } /* + * vm_page_requeue: [ internal use only ] + * + * Schedule a requeue of the given page. + * + * The page must be locked. + */ +void +vm_page_requeue(vm_page_t m) +{ + + vm_page_assert_locked(m); + KASSERT(vm_page_queue(m) != PQ_NONE, + ("%s: page %p is not logically enqueued", __func__, m)); + + if ((m->aflags & PGA_REQUEUE) == 0) + vm_page_aflag_set(m, PGA_REQUEUE); + vm_page_pqbatch_submit(m, atomic_load_8(&m->queue)); +} + +/* + * vm_page_swapqueue: [ internal use only ] + * + * Move the page from one queue to another, or to the tail of its + * current queue, in the face of a possible concurrent call to + * vm_page_dequeue_deferred_free(). + */ +void +vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq) +{ + struct vm_pagequeue *pq; + + KASSERT(oldq < PQ_COUNT && newq < PQ_COUNT && oldq != newq, + ("vm_page_swapqueue: invalid queues (%d, %d)", oldq, newq)); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("vm_page_swapqueue: page %p is unmanaged", m)); + vm_page_assert_locked(m); + + /* + * Atomically update the queue field and set PGA_REQUEUE while + * ensuring that PGA_DEQUEUE has not been set. + */ + pq = &vm_pagequeue_domain(m)->vmd_pagequeues[oldq]; + vm_pagequeue_lock(pq); + if (!vm_page_pqstate_cmpset(m, oldq, newq, PGA_DEQUEUE, PGA_REQUEUE)) { + vm_pagequeue_unlock(pq); + return; + } + if ((m->aflags & PGA_ENQUEUED) != 0) { + vm_pagequeue_remove(pq, m); + vm_page_aflag_clear(m, PGA_ENQUEUED); + } + vm_pagequeue_unlock(pq); + vm_page_pqbatch_submit(m, newq); +} + +/* * vm_page_free_prep: * * Prepares the given page to be put on the free list, * disassociating it from any VM object. The caller may return * the page to the free list only if this function returns true. * * The object must be locked. The page must be locked if it is * managed. */ bool vm_page_free_prep(vm_page_t m) { /* * Synchronize with threads that have dropped a reference to this * page. */ atomic_thread_fence_acq(); #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { uint64_t *p; int i; p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", m, i, (uintmax_t)*p)); } #endif if ((m->oflags & VPO_UNMANAGED) == 0) - KASSERT(!pmap_page_is_mapped(m) && (vm_page_aflags(m) & - (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, + KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_prep: freeing mapped page %p", m)); else - KASSERT(m->astate.queue == PQ_NONE, + KASSERT(m->queue == PQ_NONE, ("vm_page_free_prep: unmanaged page %p is queued", m)); VM_CNT_INC(v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free_prep: freeing busy page %p", m); if (m->object != NULL) { vm_page_object_remove(m); /* * The object reference can be released without an atomic * operation. */ KASSERT((m->flags & PG_FICTITIOUS) != 0 || m->ref_count == VPRC_OBJREF, ("vm_page_free_prep: page %p has unexpected ref_count %u", m, m->ref_count)); m->object = NULL; m->ref_count -= VPRC_OBJREF; } /* * If fictitious remove object association and * return. */ if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->ref_count == 1, ("fictitious page %p is referenced", m)); - KASSERT(m->astate.queue == PQ_NONE, + KASSERT(m->queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } /* * Pages need not be dequeued before they are returned to the physical * memory allocator, but they must at least be marked for a deferred * dequeue. */ if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_dequeue_free(m); + vm_page_dequeue_deferred_free(m); m->valid = 0; vm_page_undirty(m); if (m->ref_count != 0) panic("vm_page_free_prep: page %p has references", m); /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); #if VM_NRESERVLEVEL > 0 /* * Determine whether the page belongs to a reservation. If the page was * allocated from a per-CPU cache, it cannot belong to a reservation, so * as an optimization, we avoid the check in that case. */ if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m)) return (false); #endif return (true); } /* * vm_page_free_toq: * * Returns the given page to the free list, disassociating it * from any VM object. * * The object must be locked. The page must be locked if it is * managed. */ void vm_page_free_toq(vm_page_t m) { struct vm_domain *vmd; uma_zone_t zone; if (!vm_page_free_prep(m)) return; vmd = vm_pagequeue_domain(m); zone = vmd->vmd_pgcache[m->pool].zone; if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) { uma_zfree(zone, m); return; } vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, 1); } /* * vm_page_free_pages_toq: * * Returns a list of pages to the free list, disassociating it * from any VM object. In other words, this is equivalent to * calling vm_page_free_toq() for each page of a list of VM objects. * * The objects must be locked. The pages must be locked if it is * managed. */ void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) { vm_page_t m; int count; if (SLIST_EMPTY(free)) return; count = 0; while ((m = SLIST_FIRST(free)) != NULL) { count++; SLIST_REMOVE_HEAD(free, plinks.s.ss); vm_page_free_toq(m); } if (update_wire_count) vm_wire_sub(count); } /* * Mark this page as wired down, preventing reclamation by the page daemon * or when the containing object is destroyed. */ void vm_page_wire(vm_page_t m) { u_int old; KASSERT(m->object != NULL, ("vm_page_wire: page %p does not belong to an object", m)); if (!vm_page_busied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(m->ref_count) >= 1, ("vm_page_wire: fictitious page %p has zero wirings", m)); old = atomic_fetchadd_int(&m->ref_count, 1); KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX, ("vm_page_wire: counter overflow for page %p", m)); - if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_aflag_set(m, PGA_DEQUEUE); if (VPRC_WIRE_COUNT(old) == 0) vm_wire_add(1); } /* * Attempt to wire a mapped page following a pmap lookup of that page. * This may fail if a thread is concurrently tearing down mappings of the page. */ bool vm_page_wire_mapped(vm_page_t m) { u_int old; old = m->ref_count; do { KASSERT(old > 0, ("vm_page_wire_mapped: wiring unreferenced page %p", m)); if ((old & VPRC_BLOCKED) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1)); - if ((m->oflags & VPO_UNMANAGED) == 0) - vm_page_aflag_set(m, PGA_DEQUEUE); if (VPRC_WIRE_COUNT(old) == 0) vm_wire_add(1); return (true); } -/* XXX comment */ -static void -vm_page_unwire_managed(vm_page_t m, uint8_t queue, bool noreuse) -{ - u_int old; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("vm_page_unwire_managed: page %p is unmanaged", m)); - - /* - * Update LRU state before releasing the wiring reference. - * Use a release store when updating the reference count to - * synchronize with vm_page_free_prep(). - */ - old = m->ref_count; - do { - KASSERT(VPRC_WIRE_COUNT(old) > 0, - ("vm_page_unwire: wire count underflow for page %p", m)); - if (VPRC_WIRE_COUNT(old) == 1 && - !vm_page_release_toq(m, queue, noreuse)) { - old = atomic_load_int(&m->ref_count); - continue; - } - } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); - - if (VPRC_WIRE_COUNT(old) == 1) { - vm_wire_sub(1); - if (old == 1) - vm_page_free(m); - } -} - /* * Release one wiring of the specified page, potentially allowing it to be * paged out. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue. If the released wiring * represented the last reference to the page, the page is freed. * * A managed page must be locked. */ void vm_page_unwire(vm_page_t m, uint8_t queue) { + u_int old; + bool locked; KASSERT(queue < PQ_COUNT, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) != 0) { if (vm_page_unwire_noq(m) && m->ref_count == 0) vm_page_free(m); - } else { - vm_page_unwire_managed(m, queue, false); + return; } + + /* + * Update LRU state before releasing the wiring reference. + * We only need to do this once since we hold the page lock. + * Use a release store when updating the reference count to + * synchronize with vm_page_free_prep(). + */ + old = m->ref_count; + locked = false; + do { + KASSERT(VPRC_WIRE_COUNT(old) > 0, + ("vm_page_unwire: wire count underflow for page %p", m)); + if (!locked && VPRC_WIRE_COUNT(old) == 1) { + vm_page_lock(m); + locked = true; + if (queue == PQ_ACTIVE && vm_page_queue(m) == PQ_ACTIVE) + vm_page_reference(m); + else + vm_page_mvqueue(m, queue); + } + } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); + + /* + * Release the lock only after the wiring is released, to ensure that + * the page daemon does not encounter and dequeue the page while it is + * still wired. + */ + if (locked) + vm_page_unlock(m); + + if (VPRC_WIRE_COUNT(old) == 1) { + vm_wire_sub(1); + if (old == 1) + vm_page_free(m); + } } /* * Unwire a page without (re-)inserting it into a page queue. It is up * to the caller to enqueue, requeue, or free the page as appropriate. * In most cases involving managed pages, vm_page_unwire() should be used * instead. */ bool vm_page_unwire_noq(vm_page_t m) { u_int old; old = vm_page_drop(m, 1); KASSERT(VPRC_WIRE_COUNT(old) != 0, ("vm_page_unref: counter underflow for page %p", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1, ("vm_page_unref: missing ref on fictitious page %p", m)); if (VPRC_WIRE_COUNT(old) > 1) return (false); vm_wire_sub(1); return (true); } /* * Ensure that the page is in the specified page queue. If the page is * active or being moved to the active queue, ensure that its act_count is * at least ACT_INIT but do not otherwise mess with it. Otherwise, ensure that * the page is at the tail of its page queue. * * The page may be wired. The caller should release its wiring reference * before releasing the page lock, otherwise the page daemon may immediately * dequeue the page. * - * In many cases this function's parameters are known at compile-time, so - * it is inlined into its callers so as to allow constant folding to remove - * branches. - * * A managed page must be locked. */ static __always_inline void -vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag) +vm_page_mvqueue(vm_page_t m, const uint8_t nqueue) { - vm_page_astate_t old, new; + vm_page_assert_locked(m); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_mvqueue: page %p is unmanaged", m)); - KASSERT(m->ref_count > 0, - ("vm_page_mvqueue: page %p is missing refs", m)); - KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD, - ("vm_page_mvqueue: unexpected queue state flag")); - KASSERT(nflag != PGA_REQUEUE_HEAD || nqueue == PQ_INACTIVE, - ("vm_page_mvqueue: wrong queue %d for PGA_REQUEUE_HEAD", nqueue)); - for (old = vm_page_astate_load(m);;) { - if ((old.flags & PGA_DEQUEUE) != 0) - break; - new = old; - if (nqueue == PQ_ACTIVE) - new.act_count = max(old.act_count, ACT_INIT); - - if (old.queue == nqueue) { - if (nqueue != PQ_ACTIVE) - new.flags |= nflag; - if (new._bits == old._bits) - break; - } else { - new.flags |= nflag; - new.queue = nqueue; - } - if (vm_page_pqstate_commit(m, &old, new)) - break; + if (vm_page_queue(m) != nqueue) { + vm_page_dequeue(m); + vm_page_enqueue(m, nqueue); + } else if (nqueue != PQ_ACTIVE) { + vm_page_requeue(m); } + + if (nqueue == PQ_ACTIVE && m->act_count < ACT_INIT) + m->act_count = ACT_INIT; } /* * Put the specified page on the active list (if appropriate). */ void vm_page_activate(vm_page_t m) { - if ((m->oflags & VPO_UNMANAGED) != 0) + if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; - vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE); + vm_page_mvqueue(m, PQ_ACTIVE); } /* * Move the specified page to the tail of the inactive queue, or requeue * the page if it is already in the inactive queue. */ void vm_page_deactivate(vm_page_t m) { - if ((m->oflags & VPO_UNMANAGED) != 0) + if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; - vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE); + vm_page_mvqueue(m, PQ_INACTIVE); } +/* + * Move the specified page close to the head of the inactive queue, + * bypassing LRU. A marker page is used to maintain FIFO ordering. + * As with regular enqueues, we use a per-CPU batch queue to reduce + * contention on the page queue lock. + */ +static void +_vm_page_deactivate_noreuse(vm_page_t m) +{ + + vm_page_assert_locked(m); + + if (!vm_page_inactive(m)) { + vm_page_dequeue(m); + m->queue = PQ_INACTIVE; + } + if ((m->aflags & PGA_REQUEUE_HEAD) == 0) + vm_page_aflag_set(m, PGA_REQUEUE_HEAD); + vm_page_pqbatch_submit(m, PQ_INACTIVE); +} + void vm_page_deactivate_noreuse(vm_page_t m) { KASSERT(m->object != NULL, ("vm_page_deactivate_noreuse: page %p has no object", m)); - if ((m->oflags & VPO_UNMANAGED) != 0) - return; - vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD); + if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_wired(m)) + _vm_page_deactivate_noreuse(m); } /* * Put a page in the laundry, or requeue it if it is already there. */ void vm_page_launder(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m)) return; - vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE); + vm_page_mvqueue(m, PQ_LAUNDRY); } /* * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { vm_page_assert_locked(m); KASSERT(!vm_page_wired(m) && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); vm_page_dequeue(m); vm_page_enqueue(m, PQ_UNSWAPPABLE); } -/* XXX comment */ -static bool -vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse) +static void +vm_page_release_toq(vm_page_t m, int flags) { - vm_page_astate_t old, new; - uint16_t nflag; - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("vm_page_release_toq: page %p is unmanaged", m)); - KASSERT(m->ref_count > 0, - ("vm_page_release_toq: page %p is missing refs", m)); + vm_page_assert_locked(m); /* * Use a check of the valid bits to determine whether we should * accelerate reclamation of the page. The object lock might not be * held here, in which case the check is racy. At worst we will either * accelerate reclamation of a valid page and violate LRU, or * unnecessarily defer reclamation of an invalid page. * * If we were asked to not cache the page, place it near the head of the * inactive queue so that is reclaimed sooner. */ - nflag = (noreuse || m->valid == 0) ? PGA_REQUEUE_HEAD : PGA_REQUEUE; - - /* XXX explain */ - vm_page_aflag_clear(m, PGA_DEQUEUE); - - for (old = vm_page_astate_load(m);;) { - new = old; - if ((new.flags & PGA_DEQUEUE) != 0) - return (false); - if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE) { - new.flags |= PGA_REFERENCED; - } else { - if (nqueue == PQ_ACTIVE) - new.act_count = max(old.act_count, ACT_INIT); - else - new.flags |= nflag; - new.queue = nqueue; - } - - /* - * If the page queue state is not changing, we have nothing - * to do. - */ - if (new._bits == old._bits) - break; - if (vm_page_pqstate_commit(m, &old, new)) - break; - } - return (true); + if ((flags & (VPR_TRYFREE | VPR_NOREUSE)) != 0 || m->valid == 0) + _vm_page_deactivate_noreuse(m); + else if (vm_page_active(m)) + vm_page_reference(m); + else + vm_page_mvqueue(m, PQ_INACTIVE); } /* * Unwire a page and either attempt to free it or re-add it to the page queues. */ void vm_page_release(vm_page_t m, int flags) { vm_object_t object; + u_int old; + bool locked; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release: page %p is unmanaged", m)); if ((flags & VPR_TRYFREE) != 0) { for (;;) { object = (vm_object_t)atomic_load_ptr(&m->object); if (object == NULL) break; /* Depends on type-stability. */ if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object)) { object = NULL; break; } if (object == m->object) break; VM_OBJECT_WUNLOCK(object); } if (__predict_true(object != NULL)) { vm_page_release_locked(m, flags); VM_OBJECT_WUNLOCK(object); return; } } - vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0); + /* + * Update LRU state before releasing the wiring reference. + * Use a release store when updating the reference count to + * synchronize with vm_page_free_prep(). + */ + old = m->ref_count; + locked = false; + do { + KASSERT(VPRC_WIRE_COUNT(old) > 0, + ("vm_page_unwire: wire count underflow for page %p", m)); + if (!locked && VPRC_WIRE_COUNT(old) == 1) { + vm_page_lock(m); + locked = true; + vm_page_release_toq(m, flags); + } + } while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1)); + + /* + * Release the lock only after the wiring is released, to ensure that + * the page daemon does not encounter and dequeue the page while it is + * still wired. + */ + if (locked) + vm_page_unlock(m); + + if (VPRC_WIRE_COUNT(old) == 1) { + vm_wire_sub(1); + if (old == 1) + vm_page_free(m); + } } /* See vm_page_release(). */ void vm_page_release_locked(vm_page_t m, int flags) { VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release_locked: page %p is unmanaged", m)); if (vm_page_unwire_noq(m)) { if ((flags & VPR_TRYFREE) != 0 && (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) && m->dirty == 0 && !vm_page_busied(m)) { vm_page_free(m); } else { - (void)vm_page_release_toq(m, PQ_INACTIVE, flags != 0); + vm_page_lock(m); + vm_page_release_toq(m, flags); + vm_page_unlock(m); } } } static bool vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t)) { u_int old; KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0, ("vm_page_try_blocked_op: page %p has no object", m)); KASSERT(!vm_page_busied(m), ("vm_page_try_blocked_op: page %p is busy", m)); VM_OBJECT_ASSERT_LOCKED(m->object); old = m->ref_count; do { KASSERT(old != 0, ("vm_page_try_blocked_op: page %p has no references", m)); if (VPRC_WIRE_COUNT(old) != 0) return (false); } while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED)); (op)(m); /* * If the object is read-locked, new wirings may be created via an * object lookup. */ old = vm_page_drop(m, VPRC_BLOCKED); KASSERT(!VM_OBJECT_WOWNED(m->object) || old == (VPRC_BLOCKED | VPRC_OBJREF), ("vm_page_try_blocked_op: unexpected refcount value %u for %p", old, m)); return (true); } /* * Atomically check for wirings and remove all mappings of the page. */ bool vm_page_try_remove_all(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_all)); } /* * Atomically check for wirings and remove all writeable mappings of the page. */ bool vm_page_try_remove_write(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_write)); } /* * vm_page_advise * * Apply the specified advice to the given page. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else if (!vm_page_in_laundry(m)) vm_page_launder(m); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; int pflags; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ if ((allocflags & VM_ALLOC_NOCREAT) == 0) vm_page_aflag_set(m, PGA_REFERENCED); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (NULL); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); else if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } if ((allocflags & VM_ALLOC_NOCREAT) != 0) return (NULL); m = vm_page_alloc(object, pindex, pflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Grab a page and make it valid, paging in if necessary. Pages missing from * their pager are zero filled and validated. */ int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; bool sleep, xbusy; int pflags; int rv; KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); KASSERT((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0, ("vm_page_grab_valid: Invalid flags 0x%X", allocflags)); VM_OBJECT_ASSERT_WLOCKED(object); pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY); pflags |= VM_ALLOC_WAITFAIL; retrylookup: xbusy = false; if ((m = vm_page_lookup(object, pindex)) != NULL) { /* * If the page is fully valid it can only become invalid * with the object lock held. If it is not valid it can * become valid with the busy lock held. Therefore, we * may unnecessarily lock the exclusive busy here if we * race with I/O completion not using the object lock. * However, we will not end up with an invalid page and a * shared lock. */ if (m->valid != VM_PAGE_BITS_ALL || (allocflags & (VM_ALLOC_IGN_SBUSY | VM_ALLOC_SBUSY)) == 0) { sleep = !vm_page_tryxbusy(m); xbusy = true; } else sleep = !vm_page_trysbusy(m); if (sleep) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ if ((allocflags & VM_ALLOC_NOCREAT) == 0) vm_page_aflag_set(m, PGA_REFERENCED); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } if ((allocflags & VM_ALLOC_NOCREAT) != 0 && m->valid != VM_PAGE_BITS_ALL) { if (xbusy) vm_page_xunbusy(m); else vm_page_sunbusy(m); *mp = NULL; return (VM_PAGER_FAIL); } if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); if (m->valid == VM_PAGE_BITS_ALL) goto out; } else if ((allocflags & VM_ALLOC_NOCREAT) != 0) { *mp = NULL; return (VM_PAGER_FAIL); } else if ((m = vm_page_alloc(object, pindex, pflags)) != NULL) { xbusy = true; } else { goto retrylookup; } vm_page_assert_xbusied(m); MPASS(xbusy); if (vm_pager_has_page(object, pindex, NULL, NULL)) { rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { if (allocflags & VM_ALLOC_WIRED) vm_page_unwire_noq(m); vm_page_free(m); *mp = NULL; return (rv); } MPASS(m->valid == VM_PAGE_BITS_ALL); } else { vm_page_zero_invalid(m, TRUE); } out: if ((allocflags & VM_ALLOC_NOBUSY) != 0) { if (xbusy) vm_page_xunbusy(m); else vm_page_sunbusy(m); } if ((allocflags & VM_ALLOC_SBUSY) != 0 && xbusy) vm_page_busy_downgrade(m); *mp = m; return (VM_PAGER_OK); } /* * Return the specified range of pages from the given object. For each * page offset within the range, if a page already exists within the object * at that offset and it is busy, then wait for it to change state. If, * instead, the page doesn't exist, then allocate it. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs the pages * * The caller must always specify that the pages are to be busied and/or * wired. * * optional allocation flags: * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NOWAIT do not sleep * VM_ALLOC_SBUSY set page to sbusy state * VM_ALLOC_WIRED wire the pages * VM_ALLOC_ZERO zero and validate any invalid pages * * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it * may return a partial prefix of the requested range. */ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, mpred; int pflags; int i; bool sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || (allocflags & VM_ALLOC_WIRED) != 0, ("vm_page_grab_pages: the pages must be busied or wired")); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch")); if (count == 0) return (0); pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; i = 0; retrylookup: m = vm_radix_lookup_le(&object->rtree, pindex + i); if (m == NULL || m->pindex != pindex + i) { mpred = m; m = NULL; } else mpred = TAILQ_PREV(m, pglist, listq); for (; i < count; i++) { if (m != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) break; /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ if ((allocflags & VM_ALLOC_NOCREAT) == 0) vm_page_aflag_set(m, PGA_REFERENCED); vm_page_busy_sleep(m, "grbmaw", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } if ((allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); } else { if ((allocflags & VM_ALLOC_NOCREAT) != 0) break; m = vm_page_alloc_after(object, pindex + i, pflags | VM_ALLOC_COUNT(count - i), mpred); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) break; goto retrylookup; } } if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; } ma[i] = mpred = m; m = vm_page_next(m); } return (i); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; if (skip_m != NULL && skip_m->object != object) return (false); VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } void -vm_page_pagequeue_lock_assert(vm_page_t m, uint8_t queue) -{ - - if ((m->flags & PG_MARKER) != 0) - return; - - /* - * The page's page queue index may only change while the - * current queue's lock is held. - */ - KASSERT(queue != PQ_NONE, - ("page %p does not belong to a queue", m)); - vm_pagequeue_assert_locked(_vm_page_pagequeue(m, queue)); -} - -void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys, virt; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; virt = strchr(modif, 'v') != NULL; if (virt) m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); else if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref %u\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, - m->astate.queue, m->ref_count, m->astate.flags, m->oflags, - m->flags, m->astate.act_count, m->busy_lock, m->valid, m->dirty); + m->queue, m->ref_count, m->aflags, m->oflags, + m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: head/sys/vm/vm_page.h =================================================================== --- head/sys/vm/vm_page.h (revision 352406) +++ head/sys/vm/vm_page.h (revision 352407) @@ -1,925 +1,932 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several collections: * * A radix tree used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * In general, operations on this structure's mutable fields are * synchronized using either one of or a combination of the lock on the * object that the page belongs to (O), the page lock (P), * the per-domain lock for the free queues (F), or the page's queue * lock (Q). The physical address of a page is used to select its page * lock from a pool. The queue lock for a page depends on the value of * its queue field and described in detail below. If a field is * annotated below with two of these locks, then holding either lock is * sufficient for read access, but both locks are required for write * access. An annotation of (C) indicates that the field is immutable. * * In contrast, the synchronization of accesses to the page's * dirty field is machine dependent (M). In the * machine-independent layer, the lock on the object that the * page belongs to must be held in order to operate on the field. * However, the pmap layer is permitted to set all bits within * the field without holding that lock. If the underlying * architecture does not support atomic read-modify-write * operations on the field's type, then the machine-independent * layer uses a 32-bit atomic on the aligned 32-bit word that * contains the dirty field. In the machine-independent layer, * the implementation of read-modify-write operations on the * field is encapsulated in vm_page_clear_dirty_mask(). * * The ref_count field tracks references to the page. References that * prevent the page from being reclaimable are called wirings and are * counted in the low bits of ref_count. The containing object's * reference, if one exists, is counted using the VPRC_OBJREF bit in the * ref_count field. Additionally, the VPRC_BLOCKED bit is used to * atomically check for wirings and prevent new wirings via * pmap_extract_and_hold(). When a page belongs to an object, it may be * wired only when the object is locked, or the page is busy, or by * pmap_extract_and_hold(). As a result, if the object is locked and the * page is not busy (or is exclusively busied by the current thread), and * the page is unmapped, its wire count will not increase. The ref_count * field is updated using atomic operations in most cases, except when it * is known that no other references to the page exist, such as in the page * allocator. A page may be present in the page queues, or even actively * scanned by the page daemon, without an explicitly counted referenced. * The page daemon must therefore handle the possibility of a concurrent * free of the page. * * The busy lock is an embedded reader-writer lock which protects the * page's contents and identity (i.e., its tuple) and * interlocks with the object lock (O). In particular, a page may be * busied or unbusied only with the object write lock held. To avoid * bloating the page structure, the busy lock lacks some of the * features available to the kernel's general-purpose synchronization * primitives. As a result, busy lock ordering rules are not verified, * lock recursion is not detected, and an attempt to xbusy a busy page * or sbusy an xbusy page results will trigger a panic rather than * causing the thread to block. vm_page_sleep_if_busy() can be used to * sleep until the page's busy state changes, after which the caller * must re-lookup the page and re-evaluate its state. * * The queue field is the index of the page queue containing the page, * or PQ_NONE if the page is not enqueued. The queue lock of a page is * the page queue lock corresponding to the page queue index, or the * page lock (P) for the page if it is not enqueued. To modify the * queue field, the queue lock for the old value of the field must be * held. There is one exception to this rule: the page daemon may * transition the queue field from PQ_INACTIVE to PQ_NONE immediately * prior to freeing a page during an inactive queue scan. At that * point the page has already been physically dequeued and no other * references to that vm_page structure exist. * * To avoid contention on page queue locks, page queue operations * (enqueue, dequeue, requeue) are batched using per-CPU queues. A * deferred operation is requested by inserting an entry into a batch * queue; the entry is simply a pointer to the page, and the request * type is encoded in the page's aflags field using the values in * PGA_QUEUE_STATE_MASK. The type-stability of struct vm_pages is * crucial to this scheme since the processing of entries in a given * batch queue may be deferred indefinitely. In particular, a page may * be freed before its pending batch queue entries have been processed. * The page lock (P) must be held to schedule a batched queue * operation, and the page queue lock must be held in order to process * batch queue entries for the page queue. There is one exception to * this rule: the thread freeing a page may schedule a dequeue without * holding the page lock. In this scenario the only other thread which * may hold a reference to the page is the page daemon, which is * careful to avoid modifying the page's queue state once the dequeue * has been requested by setting PGA_DEQUEUE. */ #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xffu typedef uint8_t vm_page_bits_t; #elif PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffffu typedef uint16_t vm_page_bits_t; #elif PAGE_SIZE == 16384 #define VM_PAGE_BITS_ALL 0xffffffffu typedef uint32_t vm_page_bits_t; #elif PAGE_SIZE == 32768 #define VM_PAGE_BITS_ALL 0xfffffffffffffffflu typedef uint64_t vm_page_bits_t; #endif -typedef union { - struct { - uint16_t flags; - uint8_t queue; - uint8_t act_count; - }; - uint32_t _bits; -} vm_page_astate_t; - struct vm_page { union { TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */ struct { SLIST_ENTRY(vm_page) ss; /* private slists */ void *pv; } s; struct { u_long p; u_long v; } memguard; } plinks; TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_paddr_t phys_addr; /* physical address of page (C) */ struct md_page md; /* machine dependent stuff */ union { u_int wire_count; u_int ref_count; /* page references */ }; volatile u_int busy_lock; /* busy owners lock */ - vm_page_astate_t astate; /* atomically updated state */ - uint8_t flags; /* page PG_* flags (P) */ + uint16_t flags; /* page PG_* flags (P) */ uint8_t order; /* index of the buddy queue (F) */ uint8_t pool; /* vm_phys freepool index (F) */ + uint8_t aflags; /* access is atomic */ + uint8_t oflags; /* page VPO_* flags (O) */ + uint8_t queue; /* page queue index (Q) */ int8_t psind; /* pagesizes[] index (O) */ int8_t segind; /* vm_phys segment index (C) */ - uint8_t oflags; /* page VPO_* flags (O) */ + u_char act_count; /* page usage count (P) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */ vm_page_bits_t dirty; /* map of dirty DEV_BSIZE chunks (M) */ }; /* * Special bits used in the ref_count field. * * ref_count is normally used to count wirings that prevent the page from being * reclaimed, but also supports several special types of references that do not * prevent reclamation. Accesses to the ref_count field must be atomic unless * the page is unallocated. * * VPRC_OBJREF is the reference held by the containing object. It can set or * cleared only when the corresponding object's write lock is held. * * VPRC_BLOCKED is used to atomically block wirings via pmap lookups while * attempting to tear down all mappings of a given page. The page lock and * object write lock must both be held in order to set or clear this bit. */ #define VPRC_BLOCKED 0x40000000u /* mappings are being removed */ #define VPRC_OBJREF 0x80000000u /* object reference, cleared with (O) */ #define VPRC_WIRE_COUNT(c) ((c) & ~(VPRC_BLOCKED | VPRC_OBJREF)) #define VPRC_WIRE_COUNT_MAX (~(VPRC_BLOCKED | VPRC_OBJREF)) /* * Page flags stored in oflags: * * Access to these page flags is synchronized by the lock on the object * containing the page (O). * * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG) * indicates that the page is not under PV management but * otherwise should be treated as a normal page. Pages not * under PV management cannot be paged out via the * object/vm_page_t because there is no knowledge of their pte * mappings, and such pages are also not on any PQ queue. * */ #define VPO_KMEM_EXEC 0x01 /* kmem mapping allows execution */ #define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */ #define VPO_UNMANAGED 0x04 /* no PV management for page */ #define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */ #define VPO_NOSYNC 0x10 /* do not collect for syncer */ /* * Busy page implementation details. * The algorithm is taken mostly by rwlock(9) and sx(9) locks implementation, * even if the support for owner identity is removed because of size * constraints. Checks on lock recursion are then not possible, while the * lock assertions effectiveness is someway reduced. */ #define VPB_BIT_SHARED 0x01 #define VPB_BIT_EXCLUSIVE 0x02 #define VPB_BIT_WAITERS 0x04 #define VPB_BIT_FLAGMASK \ (VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS) #define VPB_SHARERS_SHIFT 3 #define VPB_SHARERS(x) \ (((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT) #define VPB_SHARERS_WORD(x) ((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED) #define VPB_ONE_SHARER (1 << VPB_SHARERS_SHIFT) #define VPB_SINGLE_EXCLUSIVER VPB_BIT_EXCLUSIVE #define VPB_UNBUSIED VPB_SHARERS_WORD(0) #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 #define PQ_LAUNDRY 2 #define PQ_UNSWAPPABLE 3 #define PQ_COUNT 4 #ifndef VM_PAGE_HAVE_PGLIST TAILQ_HEAD(pglist, vm_page); #define VM_PAGE_HAVE_PGLIST #endif SLIST_HEAD(spglist, vm_page); #ifdef _KERNEL extern vm_page_t bogus_page; #endif /* _KERNEL */ extern struct mtx_padalign pa_lock[]; #if defined(__arm__) #define PDRSHIFT PDR_SHIFT #elif !defined(PDRSHIFT) #define PDRSHIFT 21 #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define PA_LOCKPTR(pa) ((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT])) #define PA_LOCKOBJPTR(pa) ((struct lock_object *)PA_LOCKPTR((pa))) #define PA_LOCK(pa) mtx_lock(PA_LOCKPTR(pa)) #define PA_TRYLOCK(pa) mtx_trylock(PA_LOCKPTR(pa)) #define PA_UNLOCK(pa) mtx_unlock(PA_LOCKPTR(pa)) #define PA_UNLOCK_COND(pa) \ do { \ if ((pa) != 0) { \ PA_UNLOCK((pa)); \ (pa) = 0; \ } \ } while (0) #define PA_LOCK_ASSERT(pa, a) mtx_assert(PA_LOCKPTR(pa), (a)) #if defined(KLD_MODULE) && !defined(KLD_TIED) #define vm_page_lock(m) vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_unlock(m) vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_trylock(m) vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE) #else /* !KLD_MODULE */ #define vm_page_lockptr(m) (PA_LOCKPTR(VM_PAGE_TO_PHYS((m)))) #define vm_page_lock(m) mtx_lock(vm_page_lockptr((m))) #define vm_page_unlock(m) mtx_unlock(vm_page_lockptr((m))) #define vm_page_trylock(m) mtx_trylock(vm_page_lockptr((m))) #endif #if defined(INVARIANTS) #define vm_page_assert_locked(m) \ vm_page_assert_locked_KBI((m), __FILE__, __LINE__) #define vm_page_lock_assert(m, a) \ vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__) #else #define vm_page_assert_locked(m) #define vm_page_lock_assert(m, a) #endif /* * The vm_page's aflags are updated using atomic operations. To set or clear * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear() * must be used. Neither these flags nor these functions are part of the KBI. * * PGA_REFERENCED may be cleared only if the page is locked. It is set by * both the MI and MD VM layers. However, kernel loadable modules should not * directly set this flag. They should call vm_page_reference() instead. * * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter(). * When it does so, the object must be locked, or the page must be * exclusive busied. The MI VM layer must never access this flag * directly. Instead, it should call pmap_page_is_write_mapped(). * * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has * at least one executable mapping. It is not consumed by the MI VM layer. * * PGA_ENQUEUED is set and cleared when a page is inserted into or removed * from a page queue, respectively. It determines whether the plinks.q field * of the page is valid. To set or clear this flag, the queue lock for the * page must be held: the page queue lock corresponding to the page's "queue" * field if its value is not PQ_NONE, and the page lock otherwise. * * PGA_DEQUEUE is set when the page is scheduled to be dequeued from a page * queue, and cleared when the dequeue request is processed. A page may * have PGA_DEQUEUE set and PGA_ENQUEUED cleared, for instance if a dequeue * is requested after the page is scheduled to be enqueued but before it is * actually inserted into the page queue. For allocated pages, the page lock * must be held to set this flag, but it may be set by vm_page_free_prep() * without the page lock held. The page queue lock must be held to clear the * PGA_DEQUEUE flag. * * PGA_REQUEUE is set when the page is scheduled to be enqueued or requeued * in its page queue. The page lock must be held to set this flag, and the * queue lock for the page must be held to clear it. * * PGA_REQUEUE_HEAD is a special flag for enqueuing pages near the head of * the inactive queue, thus bypassing LRU. The page lock must be held to * set this flag, and the queue lock for the page must be held to clear it. */ #define PGA_WRITEABLE 0x01 /* page may be mapped writeable */ #define PGA_REFERENCED 0x02 /* page has been referenced */ #define PGA_EXECUTABLE 0x04 /* page may be mapped executable */ #define PGA_ENQUEUED 0x08 /* page is enqueued in a page queue */ #define PGA_DEQUEUE 0x10 /* page is due to be dequeued */ #define PGA_REQUEUE 0x20 /* page is due to be requeued */ #define PGA_REQUEUE_HEAD 0x40 /* page requeue should bypass LRU */ -#define PGA_QUEUE_OP_MASK (PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) -#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_QUEUE_OP_MASK) +#define PGA_QUEUE_STATE_MASK (PGA_ENQUEUED | PGA_DEQUEUE | PGA_REQUEUE | \ + PGA_REQUEUE_HEAD) /* * Page flags. If changed at any other time than page allocation or * freeing, the modification must be protected by the vm_page lock. * * The PG_PCPU_CACHE flag is set at allocation time if the page was * allocated from a per-CPU cache. It is cleared the next time that the * page is allocated from the physical memory allocator. */ -#define PG_PCPU_CACHE 0x01 /* was allocated from per-CPU caches */ -#define PG_FICTITIOUS 0x04 /* physical page doesn't exist */ -#define PG_ZERO 0x08 /* page is zeroed */ -#define PG_MARKER 0x10 /* special queue marker page */ -#define PG_NODUMP 0x80 /* don't include this page in a dump */ +#define PG_PCPU_CACHE 0x0001 /* was allocated from per-CPU caches */ +#define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ +#define PG_ZERO 0x0008 /* page is zeroed */ +#define PG_MARKER 0x0010 /* special queue marker page */ +#define PG_NODUMP 0x0080 /* don't include this page in a dump */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 64 #ifdef _KERNEL #include #include /* * Each pageable resident page falls into one of five lists: * * free * Available for allocation now. * * inactive * Low activity, candidates for reclamation. * This list is approximately LRU ordered. * * laundry * This is the list of pages that should be * paged out next. * * unswappable * Dirty anonymous pages that cannot be paged * out because no swap device is configured. * * active * Pages that are "active", i.e., they have been * recently referenced. * */ extern vm_page_t vm_page_array; /* First resident page in table */ extern long vm_page_array_size; /* number of vm_page_t's */ extern long first_page; /* first physical page number */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) /* * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory * page to which the given physical address belongs. The correct vm_page_t * object is returned for addresses that are not page-aligned. */ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); /* * Page allocation parameters for vm_page for the functions * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and * vm_page_alloc_freelist(). Some functions support only a subset * of the flags, and ignore others, see the flags legend. * * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*() * and the vm_page_grab*() functions. See these functions for details. * * Bits 0 - 1 define class. * Bits 2 - 15 dedicated for flags. * Legend: * (a) - vm_page_alloc() supports the flag. * (c) - vm_page_alloc_contig() supports the flag. * (f) - vm_page_alloc_freelist() supports the flag. * (g) - vm_page_grab() supports the flag. * (p) - vm_page_grab_pages() supports the flag. * Bits above 15 define the count of additional pages that the caller * intends to allocate. */ #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_CLASS_MASK 3 #define VM_ALLOC_WAITOK 0x0008 /* (acf) Sleep and retry */ #define VM_ALLOC_WAITFAIL 0x0010 /* (acf) Sleep and return error */ #define VM_ALLOC_WIRED 0x0020 /* (acfgp) Allocate a wired page */ #define VM_ALLOC_ZERO 0x0040 /* (acfgp) Allocate a prezeroed page */ #define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */ #define VM_ALLOC_NOBUSY 0x0200 /* (acgp) Do not excl busy the page */ #define VM_ALLOC_NOCREAT 0x0400 /* (gp) Don't create a page */ #define VM_ALLOC_IGN_SBUSY 0x1000 /* (gp) Ignore shared busy flag */ #define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */ #define VM_ALLOC_SBUSY 0x4000 /* (acgp) Shared busy the page */ #define VM_ALLOC_NOWAIT 0x8000 /* (acfgp) Do not sleep */ #define VM_ALLOC_COUNT_SHIFT 16 #define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT) #ifdef M_NOWAIT static inline int malloc2vm_flags(int malloc_flags) { int pflags; KASSERT((malloc_flags & M_USE_RESERVE) == 0 || (malloc_flags & M_NOWAIT) != 0, ("M_USE_RESERVE requires M_NOWAIT")); pflags = (malloc_flags & M_USE_RESERVE) != 0 ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM; if ((malloc_flags & M_ZERO) != 0) pflags |= VM_ALLOC_ZERO; if ((malloc_flags & M_NODUMP) != 0) pflags |= VM_ALLOC_NODUMP; if ((malloc_flags & M_NOWAIT)) pflags |= VM_ALLOC_NOWAIT; if ((malloc_flags & M_WAITOK)) pflags |= VM_ALLOC_WAITOK; return (pflags); } #endif /* * Predicates supported by vm_page_ps_test(): * * PS_ALL_DIRTY is true only if the entire (super)page is dirty. * However, it can be spuriously false when the (super)page has become * dirty in the pmap but that information has not been propagated to the * machine-independent layer. */ #define PS_ALL_DIRTY 0x1 #define PS_ALL_VALID 0x2 #define PS_NONE_BUSY 0x4 int vm_page_busy_acquire(vm_page_t m, int allocflags); void vm_page_busy_downgrade(vm_page_t m); void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared); void vm_page_free(vm_page_t m); void vm_page_free_zero(vm_page_t m); void vm_page_activate (vm_page_t); void vm_page_advise(vm_page_t m, int advice); vm_page_t vm_page_alloc(vm_object_t, vm_pindex_t, int); vm_page_t vm_page_alloc_domain(vm_object_t, vm_pindex_t, int, int); vm_page_t vm_page_alloc_after(vm_object_t, vm_pindex_t, int, vm_page_t); vm_page_t vm_page_alloc_domain_after(vm_object_t, vm_pindex_t, int, int, vm_page_t); vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_freelist(int, int); vm_page_t vm_page_alloc_freelist_domain(int, int, int); bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose); void vm_page_change_lock(vm_page_t m, struct mtx **mtx); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count); int vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex, int allocflags); void vm_page_deactivate(vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); +void vm_page_dequeue_deferred(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); bool vm_page_free_prep(vm_page_t m); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); void vm_page_pqbatch_drain(void); void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue); -bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, - vm_page_astate_t new); vm_page_t vm_page_prev(vm_page_t m); bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); void vm_page_reference(vm_page_t m); #define VPR_TRYFREE 0x01 #define VPR_NOREUSE 0x02 void vm_page_release(vm_page_t m, int flags); void vm_page_release_locked(vm_page_t m, int flags); bool vm_page_remove(vm_page_t); int vm_page_rename(vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex); void vm_page_requeue(vm_page_t m); int vm_page_sbusied(vm_page_t m); vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options); void vm_page_set_valid_range(vm_page_t m, int base, int size); int vm_page_sleep_if_busy(vm_page_t m, const char *msg); int vm_page_sleep_if_xbusy(vm_page_t m, const char *msg); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_sunbusy(vm_page_t m); void vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq); bool vm_page_try_remove_all(vm_page_t m); bool vm_page_try_remove_write(vm_page_t m); int vm_page_trysbusy(vm_page_t m); void vm_page_unhold_pages(vm_page_t *ma, int count); void vm_page_unswappable(vm_page_t m); void vm_page_unwire(vm_page_t m, uint8_t queue); bool vm_page_unwire_noq(vm_page_t m); void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_wire(vm_page_t); bool vm_page_wire_mapped(vm_page_t m); void vm_page_xunbusy_hard(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); void vm_page_clear_dirty (vm_page_t, int, int); void vm_page_set_invalid (vm_page_t, int, int); int vm_page_is_valid (vm_page_t, int, int); void vm_page_test_dirty (vm_page_t); vm_page_bits_t vm_page_bits(int base, int size); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count); void vm_page_dirty_KBI(vm_page_t m); void vm_page_lock_KBI(vm_page_t m, const char *file, int line); void vm_page_unlock_KBI(vm_page_t m, const char *file, int line); int vm_page_trylock_KBI(vm_page_t m, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line); void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #endif #define vm_page_assert_sbusied(m) \ KASSERT(vm_page_sbusied(m), \ ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_unbusied(m) \ KASSERT(!vm_page_busied(m), \ ("vm_page_assert_unbusied: page %p busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_xbusied(m) \ KASSERT(vm_page_xbusied(m), \ ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_busied(m) \ ((m)->busy_lock != VPB_UNBUSIED) #define vm_page_sbusy(m) do { \ if (!vm_page_trysbusy(m)) \ panic("%s: page %p failed shared busying", __func__, \ (m)); \ } while (0) #define vm_page_tryxbusy(m) \ (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED, \ VPB_SINGLE_EXCLUSIVER)) #define vm_page_xbusied(m) \ (((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0) #define vm_page_xbusy(m) do { \ if (!vm_page_tryxbusy(m)) \ panic("%s: page %p failed exclusive busying", __func__, \ (m)); \ } while (0) /* Note: page m's lock must not be owned by the caller. */ #define vm_page_xunbusy(m) do { \ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \ VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) \ vm_page_xunbusy_hard(m); \ } while (0) #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m); #define VM_PAGE_OBJECT_LOCK_ASSERT(m) vm_page_object_lock_assert(m) -void vm_page_pagequeue_lock_assert(vm_page_t m, uint8_t queue); -#define VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, q) vm_page_pagequeue_lock_assert(m, q) void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits); #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \ vm_page_assert_pga_writeable(m, bits) #else #define VM_PAGE_OBJECT_LOCK_ASSERT(m) (void)0 -#define VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, q) (void)0 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0 #endif /* - * We want to use atomic updates for the aflags field, which is 16 bits wide. - * However, not all architectures support atomic operations on 16-bit + * We want to use atomic updates for the aflags field, which is 8 bits wide. + * However, not all architectures support atomic operations on 8-bit * destinations. In order that we can easily use a 32-bit operation, we * require that the aflags field be 32-bit aligned. */ -_Static_assert(offsetof(struct vm_page, astate.flags) % sizeof(uint32_t) == 0, +_Static_assert(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0, "aflags field is not 32-bit aligned"); -#define VM_PAGE_AFLAG_SHIFT __offsetof(vm_page_astate_t, flags) - /* - * Return the atomic flag set for the page. + * We want to be able to update the aflags and queue fields atomically in + * the same operation. */ -static inline int -vm_page_aflags(vm_page_t m) -{ +_Static_assert(offsetof(struct vm_page, aflags) / sizeof(uint32_t) == + offsetof(struct vm_page, queue) / sizeof(uint32_t), + "aflags and queue fields do not belong to the same 32-bit word"); +_Static_assert(offsetof(struct vm_page, queue) % sizeof(uint32_t) == 2, + "queue field is at an unexpected offset"); +_Static_assert(sizeof(((struct vm_page *)NULL)->queue) == 1, + "queue field has an unexpected size"); - return (m->astate.flags); -} +#if BYTE_ORDER == LITTLE_ENDIAN +#define VM_PAGE_AFLAG_SHIFT 0 +#define VM_PAGE_QUEUE_SHIFT 16 +#else +#define VM_PAGE_AFLAG_SHIFT 24 +#define VM_PAGE_QUEUE_SHIFT 8 +#endif +#define VM_PAGE_QUEUE_MASK (0xff << VM_PAGE_QUEUE_SHIFT) /* * Clear the given bits in the specified page. */ static inline void -vm_page_aflag_clear(vm_page_t m, uint16_t bits) +vm_page_aflag_clear(vm_page_t m, uint8_t bits) { uint32_t *addr, val; /* + * The PGA_REFERENCED flag can only be cleared if the page is locked. + */ + if ((bits & PGA_REFERENCED) != 0) + vm_page_assert_locked(m); + + /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ - addr = (void *)&m->astate; + addr = (void *)&m->aflags; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_clear_32(addr, val); } /* * Set the given bits in the specified page. */ static inline void -vm_page_aflag_set(vm_page_t m, uint16_t bits) +vm_page_aflag_set(vm_page_t m, uint8_t bits) { uint32_t *addr, val; VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits); /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ - addr = (void *)&m->astate; + addr = (void *)&m->aflags; val = bits << VM_PAGE_AFLAG_SHIFT; atomic_set_32(addr, val); } -static inline vm_page_astate_t -vm_page_astate_load(vm_page_t m) -{ - vm_page_astate_t astate; - - astate._bits = atomic_load_32(&m->astate); - return (astate); -} - +/* + * Atomically update the queue state of the page. The operation fails if + * any of the queue flags in "fflags" are set or if the "queue" field of + * the page does not match the expected value; if the operation is + * successful, the flags in "nflags" are set and all other queue state + * flags are cleared. + */ static inline bool -vm_page_astate_fcmpset(vm_page_t m, vm_page_astate_t *old, - vm_page_astate_t new) +vm_page_pqstate_cmpset(vm_page_t m, uint32_t oldq, uint32_t newq, + uint32_t fflags, uint32_t nflags) { - int ret; + uint32_t *addr, nval, oval, qsmask; - KASSERT(new.queue == PQ_INACTIVE || (new.flags & PGA_REQUEUE_HEAD) == 0, - ("vm_page_astate_fcmpset: unexecpted head requeue for page %p", - m)); - KASSERT((new.flags & PGA_ENQUEUED) == 0 || new.queue != PQ_NONE, - ("vm_page_astate_fcmpset: setting PGA_ENQUEUED without a queue")); - KASSERT(new._bits != old->_bits, - ("vm_page_astate_fcmpset: bits are not changing")); + vm_page_assert_locked(m); - ret = atomic_fcmpset_32(&m->astate._bits, &old->_bits, new._bits); - if (ret != 0) { - if (old->queue != PQ_NONE && old->queue != new.queue) - VM_PAGE_PAGEQUEUE_LOCK_ASSERT(m, old->queue); - KASSERT((new.flags & PGA_ENQUEUED) == 0 || old->queue == new.queue, - ("vm_page_astate_fcmpset: PGA_ENQUEUED set after queue change for page %p", m)); - } + fflags <<= VM_PAGE_AFLAG_SHIFT; + nflags <<= VM_PAGE_AFLAG_SHIFT; + newq <<= VM_PAGE_QUEUE_SHIFT; + oldq <<= VM_PAGE_QUEUE_SHIFT; + qsmask = ((PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) << + VM_PAGE_AFLAG_SHIFT) | VM_PAGE_QUEUE_MASK; - return (ret != 0); + addr = (void *)&m->aflags; + oval = atomic_load_32(addr); + do { + if ((oval & fflags) != 0) + return (false); + if ((oval & VM_PAGE_QUEUE_MASK) != oldq) + return (false); + nval = (oval & ~qsmask) | nflags | newq; + } while (!atomic_fcmpset_32(addr, &oval, nval)); + + return (true); } /* * vm_page_dirty: * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). */ static __inline void vm_page_dirty(vm_page_t m) { /* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */ #if (defined(KLD_MODULE) && !defined(KLD_TIED)) || defined(INVARIANTS) vm_page_dirty_KBI(m); #else m->dirty = VM_PAGE_BITS_ALL; #endif } /* * vm_page_undirty: * * Set page to not be dirty. Note: does not clear pmap modify bits */ static __inline void vm_page_undirty(vm_page_t m) { VM_PAGE_OBJECT_LOCK_ASSERT(m); m->dirty = 0; } static inline void vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_t mret; mret = vm_page_replace(mnew, object, pindex); KASSERT(mret == mold, ("invalid page replacement, mold=%p, mret=%p", mold, mret)); /* Unused if !INVARIANTS. */ (void)mold; (void)mret; } /* * vm_page_queue: * - * Return the index of the queue containing m. + * Return the index of the queue containing m. This index is guaranteed + * not to change while the page lock is held. */ static inline uint8_t vm_page_queue(vm_page_t m) { - vm_page_astate_t as; - as = vm_page_astate_load(m); - if ((as.flags & PGA_DEQUEUE) != 0) + vm_page_assert_locked(m); + + if ((m->aflags & PGA_DEQUEUE) != 0) return (PQ_NONE); - return (as.queue); + atomic_thread_fence_acq(); + return (m->queue); } static inline bool vm_page_active(vm_page_t m) { return (vm_page_queue(m) == PQ_ACTIVE); } static inline bool vm_page_inactive(vm_page_t m) { return (vm_page_queue(m) == PQ_INACTIVE); } static inline bool vm_page_in_laundry(vm_page_t m) { uint8_t queue; queue = vm_page_queue(m); return (queue == PQ_LAUNDRY || queue == PQ_UNSWAPPABLE); } /* * vm_page_drop: * * Release a reference to a page and return the old reference count. */ static inline u_int vm_page_drop(vm_page_t m, u_int val) { /* * Synchronize with vm_page_free_prep(): ensure that all updates to the * page structure are visible before it is freed. */ atomic_thread_fence_rel(); return (atomic_fetchadd_int(&m->ref_count, -val)); } /* * vm_page_wired: * * Perform a racy check to determine whether a reference prevents the page * from being reclaimable. If the page's object is locked, and the page is * unmapped and unbusied or exclusively busied by the current thread, no * new wirings may be created. */ static inline bool vm_page_wired(vm_page_t m) { return (VPRC_WIRE_COUNT(m->ref_count) > 0); } #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 352406) +++ head/sys/vm/vm_pageout.c (revision 352407) @@ -1,2185 +1,2217 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); /* Pagedaemon activity rates, in subdivisions of one second. */ #define VM_LAUNDER_RATE 10 #define VM_INACT_SCAN_RATE 10 static int vm_pageout_oom_seq = 12; static int vm_pageout_update_period; static int disable_swap_pageouts; static int lowmem_period = 10; static int swapdev_enabled; static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "panic on out of memory instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RWTUN, &vm_pageout_update_period, 0, "Maximum active LRU update period"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0, "Low memory callback period"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); static int act_scan_laundry_weight = 3; SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN, &act_scan_laundry_weight, 0, "weight given to clean vs. dirty pages in active queue scans"); static u_int vm_background_launder_rate = 4096; SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, &vm_background_launder_rate, 0, "background laundering rate, in kilobytes per second"); static u_int vm_background_launder_max = 20 * 1024; SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN, &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); int vm_pageout_page_count = 32; u_long vm_page_max_user_wired; SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW, &vm_page_max_user_wired, 0, "system-wide limit to user-wired page count"); static u_int isqrt(u_int num); static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall); static void vm_pageout_laundry_worker(void *arg); struct scan_state { struct vm_batchqueue bq; struct vm_pagequeue *pq; vm_page_t marker; int maxscan; int scanned; }; static void vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq, vm_page_t marker, vm_page_t after, int maxscan) { vm_pagequeue_assert_locked(pq); - KASSERT((vm_page_aflags(marker) & PGA_ENQUEUED) == 0, + KASSERT((marker->aflags & PGA_ENQUEUED) == 0, ("marker %p already enqueued", marker)); if (after == NULL) TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q); else TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q); vm_page_aflag_set(marker, PGA_ENQUEUED); vm_batchqueue_init(&ss->bq); ss->pq = pq; ss->marker = marker; ss->maxscan = maxscan; ss->scanned = 0; vm_pagequeue_unlock(pq); } static void vm_pageout_end_scan(struct scan_state *ss) { struct vm_pagequeue *pq; pq = ss->pq; vm_pagequeue_assert_locked(pq); - KASSERT((vm_page_aflags(ss->marker) & PGA_ENQUEUED) != 0, + KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q); vm_page_aflag_clear(ss->marker, PGA_ENQUEUED); pq->pq_pdpages += ss->scanned; } /* * Add a small number of queued pages to a batch queue for later processing * without the corresponding queue lock held. The caller must have enqueued a * marker page at the desired start point for the scan. Pages will be * physically dequeued if the caller so requests. Otherwise, the returned * batch may contain marker pages, and it is up to the caller to handle them. * * When processing the batch queue, vm_page_queue() must be used to * determine whether the page has been logically dequeued by another thread. * Once this check is performed, the page lock guarantees that the page will * not be disassociated from the queue. */ static __always_inline void vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) { struct vm_pagequeue *pq; vm_page_t m, marker, n; marker = ss->marker; pq = ss->pq; - KASSERT((marker->astate.flags & PGA_ENQUEUED) != 0, + KASSERT((marker->aflags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); vm_pagequeue_lock(pq); for (m = TAILQ_NEXT(marker, plinks.q); m != NULL && ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE; m = n, ss->scanned++) { n = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) == 0) { - KASSERT((m->astate.flags & PGA_ENQUEUED) != 0, + KASSERT((m->aflags & PGA_ENQUEUED) != 0, ("page %p not enqueued", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in page queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in page queue", m)); } else if (dequeue) continue; (void)vm_batchqueue_insert(&ss->bq, m); if (dequeue) { TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_page_aflag_clear(m, PGA_ENQUEUED); } } TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q); if (__predict_true(m != NULL)) TAILQ_INSERT_BEFORE(m, marker, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q); if (dequeue) vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt); vm_pagequeue_unlock(pq); } /* * Return the next page to be scanned, or NULL if the scan is complete. */ static __always_inline vm_page_t vm_pageout_next(struct scan_state *ss, const bool dequeue) { if (ss->bq.bq_cnt == 0) vm_pageout_collect_batch(ss, dequeue); return (vm_batchqueue_pop(&ss->bq)); } /* * Scan for pages at adjacent offsets within the given page's object that are * eligible for laundering, form a cluster of these pages and the given page, * and launder that cluster. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; vm_pindex_t pindex; int ib, is, page_base, pageout_count; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); pindex = m->pindex; vm_page_assert_unbusied(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * We can cluster only if the page is not clean, busy, or held, and * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying to * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p) || vm_page_wired(p)) { ib = 0; break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; break; } + vm_page_lock(p); if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { + vm_page_unlock(p); ib = 0; break; } + vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * We are at an alignment boundary. Stop here, and switch * directions. Do not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p) || vm_page_wired(p)) break; vm_page_test_dirty(p); if (p->dirty == 0) break; - if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) + vm_page_lock(p); + if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { + vm_page_unlock(p); break; + } + vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past an alignment boundary. This catches * boundary conditions. */ if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; return (vm_pageout_flush(&mc[page_base], pageout_count, VM_PAGER_PUT_NOREUSE, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Mark the pages busy and verify that they're valid * and read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); - KASSERT((vm_page_aflags(mc[i]) & PGA_WRITEABLE) == 0, + KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0, ("vm_pageout_flush: writeable page %p", mc[i])); vm_page_sbusy(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: vm_page_lock(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); vm_page_unlock(mt); /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * The page is outside the object's range. We pretend * that the page out worked and clean the page, so the * changes will be lost if the page is reclaimed by * the page daemon. */ vm_page_undirty(mt); vm_page_lock(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); vm_page_unlock(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If the page couldn't be paged out to swap because the * pager wasn't able to find space, place the page in * the PQ_UNSWAPPABLE holding queue. This is an * optimization that prevents the page daemon from * wasting CPU cycles on pages that cannot be reclaimed * becase no swap device is configured. * * Otherwise, reactivate the page so that it doesn't * clog the laundry and inactive queues. (We will try * paging it out again later.) */ vm_page_lock(mt); if (object->type == OBJT_SWAP && pageout_status[i] == VM_PAGER_FAIL) { vm_page_unswappable(mt); numpagedout++; } else vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } static void vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused) { atomic_store_rel_int(&swapdev_enabled, 1); } static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused) { if (swap_pager_nswapdev() == 1) atomic_store_rel_int(&swapdev_enabled, 0); } /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; + vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { + vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); /* * Ensure that the object and vnode were not disassociated * while locks were dropped. */ if (vp->v_object != object) { error = ENOENT; goto unlock_all; } + vm_page_lock(m); /* * While the object and page were unlocked, the page * may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; goto unlock_all; } /* * The page may have been busied while the object and page * locks were released. */ if (vm_page_busied(m)) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } } /* * Remove all writeable mappings, failing if the page is wired. */ if (!vm_page_try_remove_write(m)) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } + vm_page_unlock(m); /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: vm_page_lock_assert(m, MA_NOTOWNED); if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * Attempt to launder the specified number of pages. * * Returns the number of pages successfully laundered. */ static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { struct scan_state ss; struct vm_pagequeue *pq; + struct mtx *mtx; vm_object_t object; vm_page_t m, marker; - vm_page_astate_t old, new; - int act_delta, error, numpagedout, queue, refs, starting_target; + int act_delta, error, numpagedout, queue, starting_target; int vnodes_skipped; bool pageout_ok; + mtx = NULL; object = NULL; starting_target = launder; vnodes_skipped = 0; /* * Scan the laundry queues for pages eligible to be laundered. We stop * once the target number of dirty pages have been laundered, or once * we've reached the end of the queue. A single iteration of this loop * may cause more than one page to be laundered because of clustering. * * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no * swap devices are configured. */ if (atomic_load_acq_int(&swapdev_enabled)) queue = PQ_UNSWAPPABLE; else queue = PQ_LAUNDRY; scan: marker = &vmd->vmd_markers[queue]; pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt); while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) { if (__predict_false((m->flags & PG_MARKER) != 0)) continue; + vm_page_change_lock(m, &mtx); + +recheck: /* - * Perform some quick and racy checks of the page's queue state. - * Bail if things are not as we expect. + * The page may have been disassociated from the queue + * or even freed while locks were dropped. We thus must be + * careful whenever modifying page state. Once the object lock + * has been acquired, we have a stable reference to the page. */ - old = vm_page_astate_load(m); - if (old.queue != PQ_LAUNDRY || (old.flags & PGA_ENQUEUED) == 0) + if (vm_page_queue(m) != queue) continue; - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { + + /* + * A requeue was requested, so this page gets a second + * chance. + */ + if ((m->aflags & PGA_REQUEUE) != 0) { vm_page_pqbatch_submit(m, queue); continue; } + /* + * Wired pages may not be freed. Complete their removal + * from the queue now to avoid needless revisits during + * future scans. This check is racy and must be reverified once + * we hold the object lock and have verified that the page + * is not busy. + */ + if (vm_page_wired(m)) { + vm_page_dequeue_deferred(m); + continue; + } + if (object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); + + /* + * A page's object pointer may be set to NULL before + * the object lock is acquired. + */ object = (vm_object_t)atomic_load_ptr(&m->object); - if (object == NULL) - continue; - VM_OBJECT_WLOCK(object); - if (m->object != object) { - VM_OBJECT_WUNLOCK(object); - object = NULL; - continue; + if (object != NULL && !VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(mtx); + /* Depends on type-stability. */ + VM_OBJECT_WLOCK(object); + mtx_lock(mtx); + goto recheck; } } + if (__predict_false(m->object == NULL)) + /* + * The page has been removed from its object. + */ + continue; + KASSERT(m->object == object, ("page %p does not belong to %p", + m, object)); if (vm_page_busied(m)) continue; /* - * Check for wirings now that we hold the object lock and have - * verified that the page is unbusied. If the page is mapped, - * it may still be wired by pmap lookups. The call to + * Re-check for wirings now that we hold the object lock and + * have verified that the page is unbusied. If the page is + * mapped, it may still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase. */ if (__predict_false(vm_page_wired(m))) { - vm_page_pqbatch_submit(m, queue); + vm_page_dequeue_deferred(m); continue; } /* * Invalid pages can be easily freed. They cannot be * mapped; vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. * * Test PGA_REFERENCED after calling pmap_ts_referenced() so * that a reference from a concurrently destroyed mapping is * observed here and now. */ - refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; + if (object->ref_count != 0) + act_delta = pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); + act_delta = 0; + } + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta++; + } + if (act_delta != 0) { + if (object->ref_count != 0) { + VM_CNT_INC(v_reactivated); + vm_page_activate(m); - for (old = vm_page_astate_load(m);;) { - if (old.queue != queue || - (old.flags & PGA_ENQUEUED) == 0) - goto next_page; + /* + * Increase the activation count if the page + * was referenced while in the laundry queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { - vm_page_pqbatch_submit(m, queue); - goto next_page; + /* + * If this was a background laundering, count + * activated pages towards our target. The + * purpose of background laundering is to ensure + * that pages are eventually cycled through the + * laundry queue, and an activation is a valid + * way out. + */ + if (!in_shortfall) + launder--; + continue; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_page_requeue(m); + continue; } - - new = old; - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } - if (act_delta != 0) { - if (object->ref_count != 0) { - /* - * Increase the activation count if the - * page was referenced while in the - * laundry queue. This makes it less - * likely that the page will be returned - * prematurely to the inactive queue. - */ - new.act_count += ACT_ADVANCE + - act_delta; - if (new.act_count > ACT_MAX) - new.act_count = ACT_MAX; - - new.flags |= PGA_REQUEUE; - new.queue = PQ_ACTIVE; - if (!vm_page_pqstate_commit(m, &old, - new)) - continue; - - VM_CNT_INC(v_reactivated); - - /* - * If this was a background laundering, - * count activated pages towards our - * target. The purpose of background - * laundering is to ensure that pages - * are eventually cycled through the - * laundry queue, and an activation is a - * valid way out. - */ - if (!in_shortfall) - launder--; - goto next_page; - } else if ((object->flags & OBJ_DEAD) == 0) { - vm_page_launder(m); - goto next_page; - } - } - break; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) { - vm_page_pqbatch_submit(m, queue); + vm_page_dequeue_deferred(m); continue; } } /* * Clean pages are freed, and dirty pages are paged out unless * they belong to a dead object. Requeueing dirty pages from * dead objects is pointless, as they are being paged out and * freed by the thread that destroyed the object. */ if (m->dirty == 0) { free_page: vm_page_free(m); VM_CNT_INC(v_dfree); } else if ((object->flags & OBJ_DEAD) == 0) { if (object->type != OBJT_SWAP && object->type != OBJT_DEFAULT) pageout_ok = true; else if (disable_swap_pageouts) pageout_ok = false; else pageout_ok = true; if (!pageout_ok) { - vm_page_launder(m); + vm_page_requeue(m); continue; } /* * Form a cluster with adjacent, dirty pages from the * same object, and page out that entire cluster. * * The adjacent, dirty pages must also be in the * laundry. However, their mappings are not checked * for new references. Consequently, a recently * referenced page may be paged out. However, that * page will not be prematurely reclaimed. After page * out, the page will be placed in the inactive queue, * where any new references will be detected and the * page reactivated. */ error = vm_pageout_clean(m, &numpagedout); if (error == 0) { launder -= numpagedout; ss.scanned += numpagedout; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } + mtx = NULL; object = NULL; } -next_page:; } + if (mtx != NULL) { + mtx_unlock(mtx); + mtx = NULL; + } if (object != NULL) { VM_OBJECT_WUNLOCK(object); object = NULL; } vm_pagequeue_lock(pq); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); if (launder > 0 && queue == PQ_UNSWAPPABLE) { queue = PQ_LAUNDRY; goto scan; } /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't launder enough pages. */ if (vnodes_skipped > 0 && launder > 0) (void)speedup_syncer(); return (starting_target - launder); } /* * Compute the integer square root. */ static u_int isqrt(u_int num) { u_int bit, root, tmp; bit = num != 0 ? (1u << ((fls(num) - 1) & ~1)) : 0; root = 0; while (bit != 0) { tmp = root + bit; root >>= 1; if (num >= tmp) { num -= tmp; root += bit; } bit >>= 2; } return (root); } /* * Perform the work of the laundry thread: periodically wake up and determine * whether any pages need to be laundered. If so, determine the number of pages * that need to be laundered, and launder them. */ static void vm_pageout_laundry_worker(void *arg) { struct vm_domain *vmd; struct vm_pagequeue *pq; uint64_t nclean, ndirty, nfreed; int domain, last_target, launder, shortfall, shortfall_cycle, target; bool in_shortfall; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; KASSERT(vmd->vmd_segs != 0, ("domain without segments")); shortfall = 0; in_shortfall = false; shortfall_cycle = 0; last_target = target = 0; nfreed = 0; /* * Calls to these handlers are serialized by the swap syscall lock. */ (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd, EVENTHANDLER_PRI_ANY); (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd, EVENTHANDLER_PRI_ANY); /* * The pageout laundry worker is never done, so loop forever. */ for (;;) { KASSERT(target >= 0, ("negative target %d", target)); KASSERT(shortfall_cycle >= 0, ("negative cycle %d", shortfall_cycle)); launder = 0; /* * First determine whether we need to launder pages to meet a * shortage of free pages. */ if (shortfall > 0) { in_shortfall = true; shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; target = shortfall; } else if (!in_shortfall) goto trybackground; else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) { /* * We recently entered shortfall and began laundering * pages. If we have completed that laundering run * (and we are no longer in shortfall) or we have met * our laundry target through other activity, then we * can stop laundering pages. */ in_shortfall = false; target = 0; goto trybackground; } launder = target / shortfall_cycle--; goto dolaundry; /* * There's no immediate need to launder any pages; see if we * meet the conditions to perform background laundering: * * 1. The ratio of dirty to clean inactive pages exceeds the * background laundering threshold, or * 2. we haven't yet reached the target of the current * background laundering run. * * The background laundering threshold is not a constant. * Instead, it is a slowly growing function of the number of * clean pages freed by the page daemon since the last * background laundering. Thus, as the ratio of dirty to * clean inactive pages grows, the amount of memory pressure * required to trigger laundering decreases. We ensure * that the threshold is non-zero after an inactive queue * scan, even if that scan failed to free a single clean page. */ trybackground: nclean = vmd->vmd_free_count + vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt; ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt; if (target == 0 && ndirty * isqrt(howmany(nfreed + 1, vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) { target = vmd->vmd_background_launder_target; } /* * We have a non-zero background laundering target. If we've * laundered up to our maximum without observing a page daemon * request, just stop. This is a safety belt that ensures we * don't launder an excessive amount if memory pressure is low * and the ratio of dirty to clean pages is large. Otherwise, * proceed at the background laundering rate. */ if (target > 0) { if (nfreed > 0) { nfreed = 0; last_target = target; } else if (last_target - target >= vm_background_launder_max * PAGE_SIZE / 1024) { target = 0; } launder = vm_background_launder_rate * PAGE_SIZE / 1024; launder /= VM_LAUNDER_RATE; if (launder > target) launder = target; } dolaundry: if (launder > 0) { /* * Because of I/O clustering, the number of laundered * pages could exceed "target" by the maximum size of * a cluster minus one. */ target -= min(vm_pageout_launder(vmd, launder, in_shortfall), target); pause("laundp", hz / VM_LAUNDER_RATE); } /* * If we're not currently laundering pages and the page daemon * hasn't posted a new request, sleep until the page daemon * kicks us. */ vm_pagequeue_lock(pq); if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE) (void)mtx_sleep(&vmd->vmd_laundry_request, vm_pagequeue_lockptr(pq), PVM, "launds", 0); /* * If the pagedaemon has indicated that it's in shortfall, start * a shortfall laundering unless we're already in the middle of * one. This may preempt a background laundering. */ if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL && (!in_shortfall || shortfall_cycle == 0)) { shortfall = vm_laundry_target(vmd) + vmd->vmd_pageout_deficit; target = 0; } else shortfall = 0; if (target == 0) vmd->vmd_laundry_request = VM_LAUNDRY_IDLE; nfreed += vmd->vmd_clean_pages_freed; vmd->vmd_clean_pages_freed = 0; vm_pagequeue_unlock(pq); } } /* * Compute the number of pages we want to try to move from the * active queue to either the inactive or laundry queue. * * When scanning active pages during a shortage, we make clean pages * count more heavily towards the page shortage than dirty pages. * This is because dirty pages must be laundered before they can be * reused and thus have less utility when attempting to quickly * alleviate a free page shortage. However, this weighting also * causes the scan to deactivate dirty pages more aggressively, * improving the effectiveness of clustering. */ static int vm_pageout_active_target(struct vm_domain *vmd) { int shortage; shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) - (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt + vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight); shortage *= act_scan_laundry_weight; return (shortage); } /* * Scan the active queue. If there is no shortage of inactive pages, scan a * small portion of the queue in order to maintain quasi-LRU. */ static void vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) { struct scan_state ss; + struct mtx *mtx; vm_object_t object; vm_page_t m, marker; - vm_page_astate_t old, new; struct vm_pagequeue *pq; long min_scan; - int act_delta, max_scan, ps_delta, refs, scan_tick; - uint8_t nqueue; + int act_delta, max_scan, scan_tick; marker = &vmd->vmd_markers[PQ_ACTIVE]; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. * * To avoid requeuing each page that remains in the active queue, we * implement the CLOCK algorithm. To keep the implementation of the * enqueue operation consistent for all page queues, we use two hands, * represented by marker pages. Scans begin at the first hand, which * precedes the second hand in the queue. When the two hands meet, * they are moved back to the head and tail of the queue, respectively, * and scanning resumes. */ max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan; + mtx = NULL; act_scan: vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan); while ((m = vm_pageout_next(&ss, false)) != NULL) { if (__predict_false(m == &vmd->vmd_clock[1])) { vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q); TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q); max_scan -= ss.scanned; vm_pageout_end_scan(&ss); goto act_scan; } if (__predict_false((m->flags & PG_MARKER) != 0)) continue; + vm_page_change_lock(m, &mtx); + + /* + * The page may have been disassociated from the queue + * or even freed while locks were dropped. We thus must be + * careful whenever modifying page state. Once the object lock + * has been acquired, we have a stable reference to the page. + */ + if (vm_page_queue(m) != PQ_ACTIVE) + continue; + + /* + * Wired pages are dequeued lazily. + */ + if (vm_page_wired(m)) { + vm_page_dequeue_deferred(m); + continue; + } + + /* + * A page's object pointer may be set to NULL before + * the object lock is acquired. + */ object = (vm_object_t)atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* * The page has been removed from its object. */ continue; /* * Check to see "how much" the page has been used. * * Test PGA_REFERENCED after calling pmap_ts_referenced() so * that a reference from a concurrently destroyed mapping is * observed here and now. * - * Perform an unsynchronized object ref count check. While the - * page lock ensures that the page is not reallocated to another - * object, in particular, one with unmanaged mappings that - * cannot support pmap_ts_referenced(), two races are, + * Perform an unsynchronized object ref count check. While + * the page lock ensures that the page is not reallocated to + * another object, in particular, one with unmanaged mappings + * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: - * * 1) The count was transitioning to zero, but we saw a non- - * zero value. pmap_ts_referenced() will return zero because - * the page is not mapped. - * 2) The count was transitioning to one, but we saw zero. This - * race delays the detection of a new reference. At worst, - * we will deactivate and reactivate the page. + * zero value. pmap_ts_referenced() will return zero + * because the page is not mapped. + * 2) The count was transitioning to one, but we saw zero. + * This race delays the detection of a new reference. At + * worst, we will deactivate and reactivate the page. */ - refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; + if (object->ref_count != 0) + act_delta = pmap_ts_referenced(m); + else + act_delta = 0; + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta++; + } - for (old = vm_page_astate_load(m);;) { - if (old.queue != PQ_ACTIVE || - (old.flags & PGA_ENQUEUED) == 0) - /* - * Something has moved the page out of the - * active queue. Don't touch it. - */ - break; - if ((old.flags & PGA_DEQUEUE) != 0) { - vm_page_pqbatch_submit(m, PQ_ACTIVE); - break; - } + /* + * Advance or decay the act_count based on recent usage. + */ + if (act_delta != 0) { + m->act_count += ACT_ADVANCE + act_delta; + if (m->act_count > ACT_MAX) + m->act_count = ACT_MAX; + } else + m->act_count -= min(m->act_count, ACT_DECLINE); - new = old; - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } - + if (m->act_count == 0) { /* - * Advance or decay the act_count based on recent usage. + * When not short for inactive pages, let dirty pages go + * through the inactive queue before moving to the + * laundry queues. This gives them some extra time to + * be reactivated, potentially avoiding an expensive + * pageout. However, during a page shortage, the + * inactive queue is necessarily small, and so dirty + * pages would only spend a trivial amount of time in + * the inactive queue. Therefore, we might as well + * place them directly in the laundry queue to reduce + * queuing overhead. */ - if (act_delta != 0) { - new.act_count += ACT_ADVANCE + act_delta; - if (new.act_count > ACT_MAX) - new.act_count = ACT_MAX; + if (page_shortage <= 0) { + vm_page_swapqueue(m, PQ_ACTIVE, PQ_INACTIVE); } else { - new.act_count -= min(new.act_count, ACT_DECLINE); - } - - if (new.act_count > 0) { /* - * Adjust the activation count and keep the page - * in the active queue. The count might be left - * unchanged if it is saturated. - */ - if (new.act_count == old.act_count || - vm_page_astate_fcmpset(m, &old, new)) - break; - } else { - /* - * When not short for inactive pages, let dirty - * pages go through the inactive queue before - * moving to the laundry queues. This gives - * them some extra time to be reactivated, - * potentially avoiding an expensive pageout. - * However, during a page shortage, the inactive - * queue is necessarily small, and so dirty - * pages would only spend a trivial amount of - * time in the inactive queue. Therefore, we - * might as well place them directly in the - * laundry queue to reduce queuing overhead. - * * Calling vm_page_test_dirty() here would * require acquisition of the object's write * lock. However, during a page shortage, - * directing dirty pages into the laundry queue - * is only an optimization and not a + * directing dirty pages into the laundry + * queue is only an optimization and not a * requirement. Therefore, we simply rely on - * the opportunistic updates to the page's dirty - * field by the pmap. + * the opportunistic updates to the page's + * dirty field by the pmap. */ - if (page_shortage <= 0) { - nqueue = PQ_INACTIVE; - ps_delta = 0; - } else if (m->dirty == 0) { - nqueue = PQ_INACTIVE; - ps_delta = act_scan_laundry_weight; + if (m->dirty == 0) { + vm_page_swapqueue(m, PQ_ACTIVE, + PQ_INACTIVE); + page_shortage -= + act_scan_laundry_weight; } else { - nqueue = PQ_LAUNDRY; - ps_delta = 1; + vm_page_swapqueue(m, PQ_ACTIVE, + PQ_LAUNDRY); + page_shortage--; } - - new.flags |= PGA_REQUEUE; - new.queue = nqueue; - if (vm_page_pqstate_commit(m, &old, new)) { - page_shortage -= ps_delta; - break; - } } } } + if (mtx != NULL) { + mtx_unlock(mtx); + mtx = NULL; + } vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); } static int vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m) { struct vm_domain *vmd; - vm_page_astate_t old, new; - for (old = vm_page_astate_load(m);;) { - if (old.queue != PQ_INACTIVE || - (old.flags & (PGA_DEQUEUE | PGA_ENQUEUED)) != 0) - break; - - new = old; - new.flags |= PGA_ENQUEUED; - new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD); - if (!vm_page_astate_fcmpset(m, &old, new)) - continue; - - if ((old.flags & PGA_REQUEUE_HEAD) != 0) { - vmd = vm_pagequeue_domain(m); - TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); - } else if ((old.flags & PGA_REQUEUE) != 0) { - TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q); - } else { - TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q); - } - return (1); - } - return (0); + if (m->queue != PQ_INACTIVE || (m->aflags & PGA_ENQUEUED) != 0) + return (0); + vm_page_aflag_set(m, PGA_ENQUEUED); + if ((m->aflags & PGA_REQUEUE_HEAD) != 0) { + vmd = vm_pagequeue_domain(m); + TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); + vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD); + } else if ((m->aflags & PGA_REQUEUE) != 0) { + TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q); + vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD); + } else + TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q); + return (1); } /* * Re-add stuck pages to the inactive queue. We will examine them again * during the next scan. If the queue state of a page has changed since * it was physically removed from the page queue in * vm_pageout_collect_batch(), don't do anything with that page. */ static void vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq, vm_page_t m) { struct vm_pagequeue *pq; int delta; delta = 0; pq = ss->pq; if (m != NULL) { if (vm_batchqueue_insert(bq, m)) return; vm_pagequeue_lock(pq); delta += vm_pageout_reinsert_inactive_page(ss, m); } else vm_pagequeue_lock(pq); while ((m = vm_batchqueue_pop(bq)) != NULL) delta += vm_pageout_reinsert_inactive_page(ss, m); vm_pagequeue_cnt_add(pq, delta); vm_pagequeue_unlock(pq); vm_batchqueue_init(bq); } /* * Attempt to reclaim the requested number of pages from the inactive queue. * Returns true if the shortage was addressed. */ static int vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage) { struct scan_state ss; struct vm_batchqueue rq; + struct mtx *mtx; vm_page_t m, marker; - vm_page_astate_t old, new; struct vm_pagequeue *pq; vm_object_t object; - int act_delta, addl_page_shortage, deficit, page_shortage, refs; + int act_delta, addl_page_shortage, deficit, page_shortage; int starting_page_shortage; /* * The addl_page_shortage is an estimate of the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * vmd_pageout_deficit counts the number of pages requested in * allocations that failed because of a free page shortage. We assume * that the allocations will be reattempted and thus include the deficit * in our scan target. */ deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit); starting_page_shortage = page_shortage = shortage + deficit; + mtx = NULL; object = NULL; vm_batchqueue_init(&rq); /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make * decisions for the inactive queue, only for the active queue.) */ marker = &vmd->vmd_markers[PQ_INACTIVE]; pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; vm_pagequeue_lock(pq); vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt); while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) { KASSERT((m->flags & PG_MARKER) == 0, ("marker page %p was dequeued", m)); + vm_page_change_lock(m, &mtx); + +recheck: /* - * Perform some quick and racy checks of the page's queue state. - * Bail if things are not as we expect. + * The page may have been disassociated from the queue + * or even freed while locks were dropped. We thus must be + * careful whenever modifying page state. Once the object lock + * has been acquired, we have a stable reference to the page. */ - old = vm_page_astate_load(m); - if (old.queue != PQ_INACTIVE || (old.flags & PGA_ENQUEUED) != 0) + if (vm_page_queue(m) != PQ_INACTIVE) { + addl_page_shortage++; continue; - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); + } + + /* + * The page was re-enqueued after the page queue lock was + * dropped, or a requeue was requested. This page gets a second + * chance. + */ + if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE | + PGA_REQUEUE_HEAD)) != 0) + goto reinsert; + + /* + * Wired pages may not be freed. Complete their removal + * from the queue now to avoid needless revisits during + * future scans. This check is racy and must be reverified once + * we hold the object lock and have verified that the page + * is not busy. + */ + if (vm_page_wired(m)) { + vm_page_dequeue_deferred(m); continue; } if (object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); + + /* + * A page's object pointer may be set to NULL before + * the object lock is acquired. + */ object = (vm_object_t)atomic_load_ptr(&m->object); - if (object == NULL) - continue; - VM_OBJECT_WLOCK(object); - if (m->object != object) { - VM_OBJECT_WUNLOCK(object); - object = NULL; - goto reinsert; + if (object != NULL && !VM_OBJECT_TRYWLOCK(object)) { + mtx_unlock(mtx); + /* Depends on type-stability. */ + VM_OBJECT_WLOCK(object); + mtx_lock(mtx); + goto recheck; } } + if (__predict_false(m->object == NULL)) + /* + * The page has been removed from its object. + */ + continue; + KASSERT(m->object == object, ("page %p does not belong to %p", + m, object)); if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; goto reinsert; } /* - * Check for wirings now that we hold the object lock and have - * verified that the page is unbusied. If the page is mapped, - * it may still be wired by pmap lookups. The call to + * Re-check for wirings now that we hold the object lock and + * have verified that the page is unbusied. If the page is + * mapped, it may still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase. */ if (__predict_false(vm_page_wired(m))) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); + vm_page_dequeue_deferred(m); continue; } /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. * * Test PGA_REFERENCED after calling pmap_ts_referenced() so * that a reference from a concurrently destroyed mapping is * observed here and now. */ - refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; + if (object->ref_count != 0) + act_delta = pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); + act_delta = 0; + } + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta++; + } + if (act_delta != 0) { + if (object->ref_count != 0) { + VM_CNT_INC(v_reactivated); + vm_page_activate(m); - for (old = vm_page_astate_load(m);;) { - if (old.queue != PQ_INACTIVE || - (old.flags & PGA_ENQUEUED) != 0) - goto next_page; - - if ((old.flags & PGA_QUEUE_OP_MASK) != 0) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); - goto next_page; + /* + * Increase the activation count if the page + * was referenced while in the inactive queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; + continue; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_page_aflag_set(m, PGA_REQUEUE); + goto reinsert; } - - new = old; - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } - if (act_delta != 0) { - if (object->ref_count != 0) { - /* - * Increase the activation count if the - * page was referenced while in the - * inactive queue. This makes it less - * likely that the page will be returned - * prematurely to the inactive queue. - */ - new.act_count += ACT_ADVANCE + - act_delta; - if (new.act_count > ACT_MAX) - new.act_count = ACT_MAX; - - new.flags |= PGA_REQUEUE; - new.queue = PQ_ACTIVE; - if (!vm_page_pqstate_commit(m, &old, - new)) - continue; - - VM_CNT_INC(v_reactivated); - goto next_page; - } else if ((object->flags & OBJ_DEAD) == 0) { - vm_page_aflag_set(m, PGA_REQUEUE); - goto reinsert; - } - } - break; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); + vm_page_dequeue_deferred(m); continue; } } /* * Clean pages can be freed, but dirty pages must be sent back * to the laundry, unless they belong to a dead object. * Requeueing dirty pages from dead objects is pointless, as * they are being paged out and freed by the thread that * destroyed the object. */ if (m->dirty == 0) { free_page: - /* XXX comment */ - old = vm_page_astate_load(m); - if (old.queue != PQ_INACTIVE || - (old.flags & PGA_QUEUE_STATE_MASK) != 0) { - vm_page_pqbatch_submit(m, PQ_INACTIVE); - goto next_page; - } - /* * Because we dequeued the page and have already * checked for concurrent dequeue and enqueue * requests, we can safely disassociate the page * from the inactive queue. */ - m->astate.queue = PQ_NONE; + KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0, + ("page %p has queue state", m)); + m->queue = PQ_NONE; vm_page_free(m); page_shortage--; } else if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); -next_page: continue; reinsert: vm_pageout_reinsert_inactive(&ss, &rq, m); } + if (mtx != NULL) + mtx_unlock(mtx); if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_pageout_reinsert_inactive(&ss, &rq, NULL); vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL); vm_pagequeue_lock(pq); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage); /* * Wake up the laundry thread so that it can perform any needed * laundering. If we didn't meet our target, we're in shortfall and * need to launder more aggressively. If PQ_LAUNDRY is empty and no * swap devices are configured, the laundry thread has no work to do, so * don't bother waking it up. * * The laundry thread uses the number of inactive queue scans elapsed * since the last laundering to determine whether to launder again, so * keep count. */ if (starting_page_shortage > 0) { pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; vm_pagequeue_lock(pq); if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE && (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) { if (page_shortage > 0) { vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL; VM_CNT_INC(v_pdshortfalls); } else if (vmd->vmd_laundry_request != VM_LAUNDRY_SHORTFALL) vmd->vmd_laundry_request = VM_LAUNDRY_BACKGROUND; wakeup(&vmd->vmd_laundry_request); } vmd->vmd_clean_pages_freed += starting_page_shortage - page_shortage; vm_pagequeue_unlock(pq); } /* * Wakeup the swapout daemon if we didn't free the targeted number of * pages. */ if (page_shortage > 0) vm_swapout_run(); /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Reclaim pages by swapping out idle processes, if configured to do so. */ vm_swapout_run_idle(); /* * See the description of addl_page_shortage above. */ *addl_shortage = addl_page_shortage + deficit; return (page_shortage <= 0); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. */ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. */ static long vm_pageout_oom_pagecount(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; long res; map = &vmspace->vm_map; KASSERT(!map->system_map, ("system map")); sx_assert(&map->lock, SA_LOCKED); res = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; obj = entry->object.vm_object; if (obj == NULL) continue; if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && obj->ref_count != 1) continue; switch (obj->type) { case OBJT_DEFAULT: case OBJT_SWAP: case OBJT_PHYS: case OBJT_VNODE: res += obj->resident_page_count; break; } } return (res); } static int vm_oom_ratelim_last; static int vm_oom_pf_secs = 10; SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0, ""); static struct mtx vm_oom_ratelim_mtx; void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; int now; bool breakout; /* * For OOM requests originating from vm_fault(), there is a high * chance that a single large process faults simultaneously in * several threads. Also, on an active system running many * processes of middle-size, like buildworld, all of them * could fault almost simultaneously as well. * * To avoid killing too many processes, rate-limit OOMs * initiated by vm_fault() time-outs on the waits for free * pages. */ mtx_lock(&vm_oom_ratelim_mtx); now = ticks; if (shortage == VM_OOM_MEM_PF && (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) { mtx_unlock(&vm_oom_ratelim_mtx); return; } vm_oom_ratelim_last = now; mtx_unlock(&vm_oom_ratelim_mtx); /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of its child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = false; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td) && !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = true; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD_LITE(p); PROC_UNLOCK(p); sx_sunlock(&allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); continue; } size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); sx_slock(&allproc_lock); /* * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { if (vm_panic_on_oom != 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); } } static bool vm_pageout_lowmem(void) { static int lowmem_ticks = 0; int last; last = atomic_load_int(&lowmem_ticks); while ((u_int)(ticks - last) / hz >= lowmem_period) { if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0) continue; /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES); /* * We do this explicitly after the caches have been * drained above. If we have a severe page shortage on * our hands, completely drain all UMA zones. Otherwise, * just prune the caches. */ uma_reclaim(vm_page_count_min() ? UMA_RECLAIM_DRAIN_CPU : UMA_RECLAIM_TRIM); return (true); } return (false); } static void vm_pageout_worker(void *arg) { struct vm_domain *vmd; u_int ofree; int addl_shortage, domain, shortage; bool target_met; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); shortage = 0; target_met = true; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(vmd->vmd_segs != 0, ("domain without segments")); vmd->vmd_last_active_scan = ticks; /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { vm_domain_pageout_lock(vmd); /* * We need to clear wanted before we check the limits. This * prevents races with wakers who will check wanted after they * reach the limit. */ atomic_store_int(&vmd->vmd_pageout_wanted, 0); /* * Might the page daemon need to run again? */ if (vm_paging_needed(vmd, vmd->vmd_free_count)) { /* * Yes. If the scan failed to produce enough free * pages, sleep uninterruptibly for some time in the * hope that the laundry thread will clean some pages. */ vm_domain_pageout_unlock(vmd); if (!target_met) pause("pwait", hz / VM_INACT_SCAN_RATE); } else { /* * No, sleep until the next wakeup or until pages * need to have their reference stats updated. */ if (mtx_sleep(&vmd->vmd_pageout_wanted, vm_domain_pageout_lockptr(vmd), PDROP | PVM, "psleep", hz / VM_INACT_SCAN_RATE) == 0) VM_CNT_INC(v_pdwakeups); } /* Prevent spurious wakeups by ensuring that wanted is set. */ atomic_store_int(&vmd->vmd_pageout_wanted, 1); /* * Use the controller to calculate how many pages to free in * this interval, and scan the inactive queue. If the lowmem * handlers appear to have freed up some pages, subtract the * difference from the inactive queue scan target. */ shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count); if (shortage > 0) { ofree = vmd->vmd_free_count; if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree) shortage -= min(vmd->vmd_free_count - ofree, (u_int)shortage); target_met = vm_pageout_scan_inactive(vmd, shortage, &addl_shortage); } else addl_shortage = 0; /* * Scan the active queue. A positive value for shortage * indicates that we must aggressively deactivate pages to avoid * a shortfall. */ shortage = vm_pageout_active_target(vmd) + addl_shortage; vm_pageout_scan_active(vmd, shortage); } } /* * vm_pageout_init initialises basic pageout daemon settings. */ static void vm_pageout_init_domain(int domain) { struct vm_domain *vmd; struct sysctl_oid *oid; vmd = VM_DOMAIN(domain); vmd->vmd_interrupt_free_min = 2; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (vmd->vmd_page_count > 1024) vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200; else vmd->vmd_free_min = 4; vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE + vmd->vmd_interrupt_free_min; vmd->vmd_free_reserved = vm_pageout_page_count + vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768); vmd->vmd_free_severe = vmd->vmd_free_min / 2; vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved; vmd->vmd_free_min += vmd->vmd_free_reserved; vmd->vmd_free_severe += vmd->vmd_free_reserved; vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2; if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3) vmd->vmd_inactive_target = vmd->vmd_free_count / 3; /* * Set the default wakeup threshold to be 10% below the paging * target. This keeps the steady state out of shortfall. */ vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9; /* * Target amount of memory to move out of the laundry queue during a * background laundering. This is proportional to the amount of system * memory. */ vmd->vmd_background_launder_target = (vmd->vmd_free_target - vmd->vmd_free_min) / 10; /* Initialize the pageout daemon pid controller. */ pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE, vmd->vmd_free_target, PIDCTRL_BOUND, PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO, "pidctrl", CTLFLAG_RD, NULL, ""); pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid)); } static void vm_pageout_init(void) { u_int freecount; int i; /* * Initialize some paging parameters. */ if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; freecount = 0; for (i = 0; i < vm_ndomains; i++) { struct vm_domain *vmd; vm_pageout_init_domain(i); vmd = VM_DOMAIN(i); vm_cnt.v_free_reserved += vmd->vmd_free_reserved; vm_cnt.v_free_target += vmd->vmd_free_target; vm_cnt.v_free_min += vmd->vmd_free_min; vm_cnt.v_inactive_target += vmd->vmd_inactive_target; vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min; vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min; vm_cnt.v_free_severe += vmd->vmd_free_severe; freecount += vmd->vmd_free_count; } /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; if (vm_page_max_user_wired == 0) vm_page_max_user_wired = freecount / 3; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { struct proc *p; struct thread *td; int error, first, i; p = curproc; td = curthread; mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF); swap_pager_swap_init(); for (first = -1, i = 0; i < vm_ndomains; i++) { if (VM_DOMAIN_EMPTY(i)) { if (bootverbose) printf("domain %d empty; skipping pageout\n", i); continue; } if (first == -1) first = i; else { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i); if (error != 0) panic("starting pageout for domain %d: %d\n", i, error); } error = kthread_add(vm_pageout_laundry_worker, (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i); if (error != 0) panic("starting laundry for domain %d: %d", i, error); } error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); snprintf(td->td_name, sizeof(td->td_name), "dom%d", first); vm_pageout_worker((void *)(uintptr_t)first); } /* * Perform an advisory wakeup of the page daemon. */ void pagedaemon_wakeup(int domain) { struct vm_domain *vmd; vmd = VM_DOMAIN(domain); vm_domain_pageout_assert_unlocked(vmd); if (curproc == pageproc) return; if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) { vm_domain_pageout_lock(vmd); atomic_store_int(&vmd->vmd_pageout_wanted, 1); wakeup(&vmd->vmd_pageout_wanted); vm_domain_pageout_unlock(vmd); } } Index: head/sys/vm/vm_pagequeue.h =================================================================== --- head/sys/vm/vm_pagequeue.h (revision 352406) +++ head/sys/vm/vm_pagequeue.h (revision 352407) @@ -1,343 +1,325 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_PAGEQUEUE_ #define _VM_PAGEQUEUE_ #ifdef _KERNEL struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; int pq_cnt; const char * const pq_name; uint64_t pq_pdpages; } __aligned(CACHE_LINE_SIZE); #ifndef VM_BATCHQUEUE_SIZE #define VM_BATCHQUEUE_SIZE 7 #endif struct vm_batchqueue { vm_page_t bq_pa[VM_BATCHQUEUE_SIZE]; int bq_cnt; } __aligned(CACHE_LINE_SIZE); #include #include struct sysctl_oid; /* * One vm_domain per-numa domain. Contains pagequeues, free page structures, * and accounting. * * Lock Key: * f vmd_free_mtx * p vmd_pageout_mtx * d vm_domainset_lock * a atomic * c const after boot * q page queue lock */ struct vm_domain { struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; struct mtx_padalign vmd_free_mtx; struct mtx_padalign vmd_pageout_mtx; struct vm_pgcache { int domain; int pool; uma_zone_t zone; } vmd_pgcache[VM_NFREEPOOL]; struct vmem *vmd_kernel_arena; /* (c) per-domain kva R/W arena. */ struct vmem *vmd_kernel_rwx_arena; /* (c) per-domain kva R/W/X arena. */ u_int vmd_domain; /* (c) Domain number. */ u_int vmd_page_count; /* (c) Total page count. */ long vmd_segs; /* (c) bitmask of the segments */ u_int __aligned(CACHE_LINE_SIZE) vmd_free_count; /* (a,f) free page count */ u_int vmd_pageout_deficit; /* (a) Estimated number of pages deficit */ uint8_t vmd_pad[CACHE_LINE_SIZE - (sizeof(u_int) * 2)]; /* Paging control variables, used within single threaded page daemon. */ struct pidctrl vmd_pid; /* Pageout controller. */ boolean_t vmd_oom; int vmd_oom_seq; int vmd_last_active_scan; struct vm_page vmd_markers[PQ_COUNT]; /* (q) markers for queue scans */ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ struct vm_page vmd_clock[2]; /* markers for active queue scan */ int vmd_pageout_wanted; /* (a, p) pageout daemon wait channel */ int vmd_pageout_pages_needed; /* (d) page daemon waiting for pages? */ bool vmd_minset; /* (d) Are we in vm_min_domains? */ bool vmd_severeset; /* (d) Are we in vm_severe_domains? */ enum { VM_LAUNDRY_IDLE = 0, VM_LAUNDRY_BACKGROUND, VM_LAUNDRY_SHORTFALL } vmd_laundry_request; /* Paging thresholds and targets. */ u_int vmd_clean_pages_freed; /* (q) accumulator for laundry thread */ u_int vmd_background_launder_target; /* (c) */ u_int vmd_free_reserved; /* (c) pages reserved for deadlock */ u_int vmd_free_target; /* (c) pages desired free */ u_int vmd_free_min; /* (c) pages desired free */ u_int vmd_inactive_target; /* (c) pages desired inactive */ u_int vmd_pageout_free_min; /* (c) min pages reserved for kernel */ u_int vmd_pageout_wakeup_thresh;/* (c) min pages to wake pagedaemon */ u_int vmd_interrupt_free_min; /* (c) reserved pages for int code */ u_int vmd_free_severe; /* (c) severe page depletion point */ /* Name for sysctl etc. */ struct sysctl_oid *vmd_oid; char vmd_name[sizeof(__XSTRING(MAXMEMDOM))]; } __aligned(CACHE_LINE_SIZE); extern struct vm_domain vm_dom[MAXMEMDOM]; #define VM_DOMAIN(n) (&vm_dom[(n)]) #define VM_DOMAIN_EMPTY(n) (vm_dom[(n)].vmd_page_count == 0) #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) #define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) #define vm_pagequeue_trylock(pq) mtx_trylock(&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #define vm_domain_free_assert_locked(n) \ mtx_assert(vm_domain_free_lockptr((n)), MA_OWNED) #define vm_domain_free_assert_unlocked(n) \ mtx_assert(vm_domain_free_lockptr((n)), MA_NOTOWNED) #define vm_domain_free_lock(d) \ mtx_lock(vm_domain_free_lockptr((d))) #define vm_domain_free_lockptr(d) \ (&(d)->vmd_free_mtx) #define vm_domain_free_trylock(d) \ mtx_trylock(vm_domain_free_lockptr((d))) #define vm_domain_free_unlock(d) \ mtx_unlock(vm_domain_free_lockptr((d))) #define vm_domain_pageout_lockptr(d) \ (&(d)->vmd_pageout_mtx) #define vm_domain_pageout_assert_locked(n) \ mtx_assert(vm_domain_pageout_lockptr((n)), MA_OWNED) #define vm_domain_pageout_assert_unlocked(n) \ mtx_assert(vm_domain_pageout_lockptr((n)), MA_NOTOWNED) #define vm_domain_pageout_lock(d) \ mtx_lock(vm_domain_pageout_lockptr((d))) #define vm_domain_pageout_unlock(d) \ mtx_unlock(vm_domain_pageout_lockptr((d))) static __inline void vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) { vm_pagequeue_assert_locked(pq); pq->pq_cnt += addend; } #define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) #define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) static inline void vm_pagequeue_remove(struct vm_pagequeue *pq, vm_page_t m) { - vm_pagequeue_assert_locked(pq); - TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } static inline void vm_batchqueue_init(struct vm_batchqueue *bq) { bq->bq_cnt = 0; } static inline bool vm_batchqueue_insert(struct vm_batchqueue *bq, vm_page_t m) { if (bq->bq_cnt < nitems(bq->bq_pa)) { bq->bq_pa[bq->bq_cnt++] = m; return (true); } return (false); } static inline vm_page_t vm_batchqueue_pop(struct vm_batchqueue *bq) { if (bq->bq_cnt == 0) return (NULL); return (bq->bq_pa[--bq->bq_cnt]); } void vm_domain_set(struct vm_domain *vmd); void vm_domain_clear(struct vm_domain *vmd); int vm_domain_allocate(struct vm_domain *vmd, int req, int npages); /* * vm_pagequeue_domain: * * Return the memory domain the page belongs to. */ static inline struct vm_domain * vm_pagequeue_domain(vm_page_t m) { return (VM_DOMAIN(vm_phys_domain(m))); -} - -static inline struct vm_pagequeue * -_vm_page_pagequeue(vm_page_t m, uint8_t queue) -{ - - if (queue == PQ_NONE) - return (NULL); - return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); -} - -static inline struct vm_pagequeue * -vm_page_pagequeue(vm_page_t m) -{ - - return (_vm_page_pagequeue(m, atomic_load_8(&m->astate.queue))); } /* * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. */ static inline int vm_paging_target(struct vm_domain *vmd) { return (vmd->vmd_free_target - vmd->vmd_free_count); } /* * Returns TRUE if the pagedaemon needs to be woken up. */ static inline int vm_paging_needed(struct vm_domain *vmd, u_int free_count) { return (free_count < vmd->vmd_pageout_wakeup_thresh); } /* * Returns TRUE if the domain is below the min paging target. */ static inline int vm_paging_min(struct vm_domain *vmd) { return (vmd->vmd_free_min > vmd->vmd_free_count); } /* * Returns TRUE if the domain is below the severe paging target. */ static inline int vm_paging_severe(struct vm_domain *vmd) { return (vmd->vmd_free_severe > vmd->vmd_free_count); } /* * Return the number of pages we need to launder. * A positive number indicates that we have a shortfall of clean pages. */ static inline int vm_laundry_target(struct vm_domain *vmd) { return (vm_paging_target(vmd)); } void pagedaemon_wakeup(int domain); static inline void vm_domain_freecnt_inc(struct vm_domain *vmd, int adj) { u_int old, new; old = atomic_fetchadd_int(&vmd->vmd_free_count, adj); new = old + adj; /* * Only update bitsets on transitions. Notice we short-circuit the * rest of the checks if we're above min already. */ if (old < vmd->vmd_free_min && (new >= vmd->vmd_free_min || (old < vmd->vmd_free_severe && new >= vmd->vmd_free_severe) || (old < vmd->vmd_pageout_free_min && new >= vmd->vmd_pageout_free_min))) vm_domain_clear(vmd); } #endif /* _KERNEL */ #endif /* !_VM_PAGEQUEUE_ */ Index: head/sys/vm/vm_swapout.c =================================================================== --- head/sys/vm/vm_swapout.c (revision 352406) +++ head/sys/vm/vm_swapout.c (revision 352407) @@ -1,989 +1,967 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include #include +#include #include #include #include /* the kernel process "vm_daemon" */ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); static int vm_swap_enabled = 1; static int vm_swap_idle_enabled = 0; SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, "Time before a process will be swapped out"); static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); static int swapped_cnt; static int swap_inprogress; /* Pending swap-ins done outside swapper. */ static int last_swapin; static void swapclear(struct proc *); static int swapout(struct proc *); static void vm_swapout_map_deactivate_pages(vm_map_t, long); static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long); static void swapout_procs(int action); static void vm_req_vmdaemon(int req); static void vm_thread_swapout(struct thread *td); -static void -vm_swapout_object_deactivate_page(vm_page_t m, int remove_mode) -{ - vm_page_astate_t old, new; - int act_delta, refs; - - refs = pmap_ts_referenced(m); - - for (old = vm_page_astate_load(m);;) { - if ((old.flags & PGA_DEQUEUE) != 0) - break; - - act_delta = refs; - if ((old.flags & PGA_REFERENCED) != 0) { - new.flags &= ~PGA_REFERENCED; - act_delta++; - } - - if (old.queue != PQ_ACTIVE && act_delta != 0) { - if (new.act_count == ACT_MAX) - break; - new.act_count += act_delta; - new.flags |= PGA_REQUEUE; - new.queue = PQ_ACTIVE; - if (vm_page_pqstate_commit(m, &old, new)) - break; - } else if (old.queue == PQ_ACTIVE) { - if (act_delta == 0) { - new.act_count -= min(new.act_count, - ACT_DECLINE); - if (!remove_mode && new.act_count == 0) { - (void)vm_page_try_remove_all(m); - - new.flags |= PGA_REQUEUE; - new.queue = PQ_INACTIVE; - } - if (vm_page_pqstate_commit(m, &old, new)) - break; - } else { - if (new.act_count < ACT_MAX - ACT_ADVANCE) - new.act_count += ACT_ADVANCE; - if (vm_page_astate_fcmpset(m, &old, new)) - break; - } - } else { - (void)vm_page_try_remove_all(m); - } - } -} - /* * vm_swapout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. */ static void vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, long desired) { vm_object_t backing_object, object; vm_page_t p; - int remove_mode; + int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_UNMANAGED) != 0 || REFCOUNT_COUNT(object->paging_in_progress) > 0) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * Scan the object's entire memory queue. */ TAILQ_FOREACH(p, &object->memq, listq) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (should_yield()) goto unlock_return; /* * The page may acquire a wiring after this check. * The page daemon handles wired pages, so there is * no harm done if a wiring appears while we are * attempting to deactivate the page. */ if (vm_page_busied(p) || vm_page_wired(p)) continue; VM_CNT_INC(v_pdpages); if (!pmap_page_exists_quick(pmap, p)) continue; - - vm_swapout_object_deactivate_page(p, remove_mode); + act_delta = pmap_ts_referenced(p); + vm_page_lock(p); + if ((p->aflags & PGA_REFERENCED) != 0) { + if (act_delta == 0) + act_delta = 1; + vm_page_aflag_clear(p, PGA_REFERENCED); + } + if (!vm_page_active(p) && act_delta != 0) { + vm_page_activate(p); + p->act_count += act_delta; + } else if (vm_page_active(p)) { + /* + * The page daemon does not requeue pages + * after modifying their activation count. + */ + if (act_delta == 0) { + p->act_count -= min(p->act_count, + ACT_DECLINE); + if (!remove_mode && p->act_count == 0) { + (void)vm_page_try_remove_all(p); + vm_page_deactivate(p); + } + } else { + vm_page_activate(p); + if (p->act_count < ACT_MAX - + ACT_ADVANCE) + p->act_count += ACT_ADVANCE; + } + } else if (vm_page_inactive(p)) + (void)vm_page_try_remove_all(p); + vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_RLOCK(backing_object); if (object != first_object) VM_OBJECT_RUNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_RUNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_swapout_map_deactivate_pages(vm_map_t map, long desired) { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock_read(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_swapout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock_read(map); } /* * Swap out requests */ #define VM_SWAP_NORMAL 1 #define VM_SWAP_IDLE 2 void vm_swapout_run(void) { if (vm_swap_enabled) vm_req_vmdaemon(VM_SWAP_NORMAL); } /* * Idle process swapout -- run once per second when pagedaemons are * reclaiming pages. */ void vm_swapout_run_idle(void) { static long lsec; if (!vm_swap_idle_enabled || time_second == lsec) return; vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } static void vm_req_vmdaemon(int req) { static int lastrun = 0; mtx_lock(&vm_daemon_mtx); vm_pageout_req_swapout |= req; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } mtx_unlock(&vm_daemon_mtx); } static void vm_daemon(void) { struct rlimit rsslim; struct proc *p; struct thread *td; struct vmspace *vm; int breakout, swapout_flags, tryagain, attempts; #ifdef RACCT uint64_t rsize, ravailable; #endif while (TRUE) { mtx_lock(&vm_daemon_mtx); msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT racct_enable ? hz : 0 #else 0 #endif ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); if (swapout_flags != 0) { /* * Drain the per-CPU page queue batches as a deadlock * avoidance measure. */ if ((swapout_flags & VM_SWAP_NORMAL) != 0) vm_page_pqbatch_drain(); swapout_procs(swapout_flags); } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ tryagain = 0; attempts = 0; again: attempts++; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); _PHOLD_LITE(p); PROC_UNLOCK(p); if (vm == NULL) { PRELE(p); continue; } sx_sunlock(&allproc_lock); size = vmspace_resident_count(vm); if (size >= limit) { vm_swapout_map_deactivate_pages( &vm->vm_map, limit); size = vmspace_resident_count(vm); } #ifdef RACCT if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); if (rsize > ravailable) { /* * Don't be overly aggressive; this * might be an innocent process, * and the limit could've been exceeded * by some memory hog. Don't try * to deactivate more than 1/4th * of process' resident set size. */ if (attempts <= 8) { if (ravailable < rsize - (rsize / 4)) { ravailable = rsize - (rsize / 4); } } vm_swapout_map_deactivate_pages( &vm->vm_map, OFF_TO_IDX(ravailable)); /* Update RSS usage after paging out. */ size = vmspace_resident_count(vm); rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); PROC_UNLOCK(p); if (rsize > ravailable) tryagain = 1; } } #endif vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) { maybe_yield(); goto again; } } } /* * Allow a thread's kernel stack to be paged out. */ static void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_dirty(m); vm_page_unwire(m, PQ_LAUNDRY); } VM_OBJECT_WUNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. */ static void vm_thread_swapin(struct thread *td, int oom_alloc) { vm_object_t ksobj; vm_page_t ma[KSTACK_MAX_PAGES]; int a, count, i, j, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_WLOCK(ksobj); (void)vm_page_grab_pages(ksobj, 0, oom_alloc | VM_ALLOC_WIRED, ma, pages); for (i = 0; i < pages;) { vm_page_assert_xbusied(ma[i]); if (ma[i]->valid == VM_PAGE_BITS_ALL) { vm_page_xunbusy(ma[i]); i++; continue; } vm_object_pip_add(ksobj, 1); for (j = i + 1; j < pages; j++) if (ma[j]->valid == VM_PAGE_BITS_ALL) break; rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a); KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i])); count = min(a + 1, j - i); rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL); KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d", __func__, td->td_proc->p_pid)); vm_object_pip_wakeup(ksobj); for (j = i; j < i + count; j++) vm_page_xunbusy(ma[j]); i += count; } VM_OBJECT_WUNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } void faultin(struct proc *p) { struct thread *td; int oom_alloc; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_flag & P_SWAPPINGIN) { while (p->p_flag & P_SWAPPINGIN) msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0); return; } if ((p->p_flag & P_INMEM) == 0) { oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; p->p_flag |= P_SWAPPINGIN; PROC_UNLOCK(p); sx_xlock(&allproc_lock); MPASS(swapped_cnt > 0); swapped_cnt--; if (curthread != &thread0) swap_inprogress++; sx_xunlock(&allproc_lock); /* * We hold no lock here because the list of threads * can not change while all threads in the process are * swapped out. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td, oom_alloc); if (curthread != &thread0) { sx_xlock(&allproc_lock); MPASS(swap_inprogress > 0); swap_inprogress--; last_swapin = ticks; sx_xunlock(&allproc_lock); } PROC_LOCK(p); swapclear(p); p->p_swtick = ticks; /* Allow other threads to swap p out now. */ wakeup(&p->p_flag); --p->p_lock; } } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. */ static struct proc * swapper_selector(bool wkilled_only) { struct proc *p, *res; struct thread *td; int ppri, pri, slptime, swtime; sx_assert(&allproc_lock, SA_SLOCKED); if (swapped_cnt == 0) return (NULL); res = NULL; ppri = INT_MIN; FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) != 0) { PROC_UNLOCK(p); continue; } if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) { /* * A swapped-out process might have mapped a * large portion of the system's pages as * anonymous memory. There is no other way to * release the memory other than to kill the * process, for which we need to swap it in. */ return (p); } if (wkilled_only) { PROC_UNLOCK(p); continue; } swtime = (ticks - p->p_swtick) / hz; FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. */ thread_lock(td); if (td->td_inhibitors == TDI_SWAPPED) { slptime = (ticks - td->td_slptick) / hz; pri = swtime + slptime; if ((td->td_flags & TDF_SWAPINREQ) == 0) pri -= p->p_nice * 8; /* * if this thread is higher priority * and there is enough space, then select * this process instead of the previous * selection. */ if (pri > ppri) { res = p; ppri = pri; } } thread_unlock(td); } PROC_UNLOCK(p); } if (res != NULL) PROC_LOCK(res); return (res); } #define SWAPIN_INTERVAL (MAXSLP * hz / 2) /* * Limit swapper to swap in one non-WKILLED process in MAXSLP/2 * interval, assuming that there is: * - at least one domain that is not suffering from a shortage of free memory; * - no parallel swap-ins; * - no other swap-ins in the current SWAPIN_INTERVAL. */ static bool swapper_wkilled_only(void) { return (vm_page_count_min_set(&all_domains) || swap_inprogress > 0 || (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL); } void swapper(void) { struct proc *p; for (;;) { sx_slock(&allproc_lock); p = swapper_selector(swapper_wkilled_only()); sx_sunlock(&allproc_lock); if (p == NULL) { tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL); } else { PROC_LOCK_ASSERT(p, MA_OWNED); /* * Another process may be bringing or may have * already brought this process in while we * traverse all threads. Or, this process may * have exited or even being swapped out * again. */ if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) { faultin(p); } PROC_UNLOCK(p); } } } /* * First, if any processes have been sleeping or stopped for at least * "swap_idle_threshold1" seconds, they are swapped out. If, however, * no such processes exist, then the longest-sleeping or stopped * process is swapped out. Finally, and only as a last resort, if * there are no sleeping or stopped processes, the longest-resident * process is swapped out. */ static void swapout_procs(int action) { struct proc *p; struct thread *td; int slptime; bool didswap, doswap; MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0); didswap = false; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * Filter out not yet fully constructed processes. Do * not swap out held processes. Avoid processes which * are system, exiting, execing, traced, already swapped * out or are in the process of being swapped in or out. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag & (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE | P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) != P_INMEM) { PROC_UNLOCK(p); continue; } /* * Further consideration of this process for swap out * requires iterating over its threads. We release * allproc_lock here so that process creation and * destruction are not blocked while we iterate. * * To later reacquire allproc_lock and resume * iteration over the allproc list, we will first have * to release the lock on the process. We place a * hold on the process so that it remains in the * allproc list while it is unlocked. */ _PHOLD_LITE(p); sx_sunlock(&allproc_lock); /* * Do not swapout a realtime process. * Guarantee swap_idle_threshold1 time in memory. * If the system is under memory stress, or if we are * swapping idle processes >= swap_idle_threshold2, * then swap the process out. */ doswap = true; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); slptime = (ticks - td->td_slptick) / hz; if (PRI_IS_REALTIME(td->td_pri_class) || slptime < swap_idle_threshold1 || !thread_safetoswapout(td) || ((action & VM_SWAP_NORMAL) == 0 && slptime < swap_idle_threshold2)) doswap = false; thread_unlock(td); if (!doswap) break; } if (doswap && swapout(p) == 0) didswap = true; PROC_UNLOCK(p); if (didswap) { sx_xlock(&allproc_lock); swapped_cnt++; sx_downgrade(&allproc_lock); } else sx_slock(&allproc_lock); PRELE(p); } sx_sunlock(&allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapclear(struct proc *p) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_flags |= TDF_INMEM; td->td_flags &= ~TDF_SWAPINREQ; TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) if (setrunnable(td)) { #ifdef INVARIANTS /* * XXX: We just cleared TDI_SWAPPED * above and set TDF_INMEM, so this * should never happen. */ panic("not waking up swapper"); #endif } thread_unlock(td); } p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT); p->p_flag |= P_INMEM; } static int swapout(struct proc *p) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * The states of this process and its threads may have changed * by now. Assuming that there is only one pageout daemon thread, * this process should still be in memory. */ KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) == P_INMEM, ("swapout: lost a swapout race?")); /* * Remember the resident count. */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); /* * Check and mark all threads before we proceed. */ p->p_flag &= ~P_INMEM; p->p_flag |= P_SWAPPINGOUT; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!thread_safetoswapout(td)) { thread_unlock(td); swapclear(p); return (EBUSY); } td->td_flags &= ~TDF_INMEM; TD_SET_SWAPPED(td); thread_unlock(td); } td = FIRST_THREAD_IN_PROC(p); ++td->td_ru.ru_nswap; PROC_UNLOCK(p); /* * This list is stable because all threads are now prevented from * running. The list is only modified in the context of a running * thread in this process. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); p->p_flag &= ~P_SWAPPINGOUT; p->p_swtick = ticks; return (0); }