Index: stable/11/sys/amd64/amd64/pmap.c =================================================================== --- stable/11/sys/amd64/amd64/pmap.c (revision 318715) +++ stable/11/sys/amd64/amd64/pmap.c (revision 318716) @@ -1,7338 +1,7320 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = X86_PG_G; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } extern struct pcpu __pcpu[]; #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; static u_long pmap_invl_gen = 0; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(curthread->td_md.md_invl_gen.gen == 0, ("DI already started")) /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_started(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. */ static void pmap_delayed_invl_finished(void) { struct pmap_invl_gen *invl_gen, *next; struct turnstile *ts; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_started")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) { turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); pmap_invl_gen = invl_gen->gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts, TS_SHARED_LOCK); } turnstile_chain_unlock(&invl_gen_ts); } else { next->gen = invl_gen->gen; } LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); #endif static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait(vm_page_t m) { struct thread *td; struct turnstile *ts; u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif td = curthread; m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > pmap_invl_gen) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finished(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); -static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); -static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); +static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); /* * Move the kernel virtual free pointer to the next * 2MB. This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = roundup2(addr, NBPDR); return (newaddr); } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe; pt_entry_t *pt_p; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) ndm1g = ptoa(Maxmem) >> PDPSHIFT; if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* Fill in the underlying page table pages */ /* Nominally read-only (but really R/W) from zero to physfree */ /* XXX not fully used, underneath 2M pages */ pt_p = (pt_entry_t *)KPTphys; for (i = 0; ptoa(i) < *firstaddr; i++) pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; /* Now map the page tables at their location within PTmap */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G; /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | X86_PG_M | X86_PG_A; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | X86_PG_M | X86_PG_A; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U; } } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte; int i; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) load_cr4(rcr4() | CR4_SMEP); /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; virtual_avail = va; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { /* Check for INVPCID support */ invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } __pcpu[0].pc_pcid_next = PMAP_PCID_KERN + 1; __pcpu[0].pc_pcid_gen = 1; /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } else { pmap_pcid_enabled = 0; } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int error, i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline boolean_t pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask; u_int cpuid, i; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) invlpg(va); else if (pmap_pcid_enabled) pmap->pm_pcids[cpuid].pm_gen = 0; if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask; vm_offset_t addr; u_int cpuid, i; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); cpuid = PCPU_GET(cpuid); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else if (pmap_pcid_enabled) { pmap->pm_pcids[cpuid].pm_gen = 0; } if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask; struct invpcid_descr d; u_int cpuid, i; if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { if (invpcid_works) { d.pcid = pmap->pm_pcids[cpuid].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { load_cr3(pmap->pm_cr3 | pmap->pm_pcids [PCPU_GET(cpuid)].pm_pcid); } } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[cpuid].pm_gen = 0; } if (pmap_pcid_enabled) { CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } mask = &pmap->pm_active; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendevous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) invlpg(va); else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid); } } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)cpu_clflush_line_size; } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { if (pmap_kextract(sva) == lapic_paddr) return; /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* - * Looks for a page table page mapping the specified virtual address in the - * specified pmap's collection of idle page table pages. Returns NULL if there - * is no page table page corresponding to the specified virtual address. + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. */ static __inline vm_page_t -pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); - return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); + return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* - * Removes the specified page table page from the specified pmap's collection - * of idle page table pages. The specified page table page must be a member of - * the pmap's collection. - */ -static __inline void -pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) -{ - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - vm_radix_remove(&pmap->pm_root, mpte->pindex); -} - -/* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { int i; PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); pmap->pm_cr3 = KPML4phys; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } PCPU_SET(curpmap, kernel_pmap); pmap_activate(curthread); CPU_FILL(&kernel_pmap->pm_active); } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. */ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V | PG_U; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg; vm_paddr_t pml4phys; int i; /* * allocate the page directory page */ while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; pml4phys = VM_PAGE_TO_PHYS(pml4pg); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = ~0; /* initialize to an invalid value */ if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. */ if ((pmap->pm_type = pm_type) == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); VM_WAIT; PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); TAILQ_INIT(&new_tail); pmap_delayed_invl_started(); mtx_lock(&pv_chunks_mutex); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap_delayed_invl_finished(); pmap_delayed_invl_started(); pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); } else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); mtx_lock(&pv_chunks_mutex); continue; } PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); mtx_lock(&pv_chunks_mutex); continue; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; } TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap_delayed_invl_finished(); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; int avail, free; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Conditionally create the PV entry for a 2MB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; int PG_PTE_CACHE; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); - if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != - NULL) - pmap_remove_pt_page(pmap, mpte); - else { + if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed or the allocation of the new * page table page fails. If the 2MB page mapping belongs to * the direct map region of the kernel's address space, then * the page allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is * normal. Page table pages are preallocated for every other * part of the kernel address space, so the direct map region * is the only part of the kernel address space that must be * handled here. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); pmap_invalidate_range(pmap, sva, sva + NBPDR - 1); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap_resident_count_inc(pmap, 1); } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (va >= VM_MAXUSER_ADDRESS) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - mpte = pmap_lookup_pt_page(pmap, va); + mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); - pmap_remove_pt_page(pmap, mpte); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * Initialize the page table page. */ pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * When workaround_erratum383 is false, a promotion to a 2M * page mapping does not invalidate the 512 4K page mappings * from the TLB. Consequently, at this point, the TLB may * hold both 4K and 2M page mappings. Therefore, the entire * range of addresses must be invalidated here. In contrast, * when workaround_erratum383 is true, a promotion does * invalidate the 512 4K page mappings, and so a single INVLPG * suffices to invalidate the 2M page mapping. */ if ((oldpde & PG_G) != 0) { if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { - mpte = pmap_lookup_pt_page(pmap, sva); + mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va, va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_started(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if ((*pte & PG_G) == 0) anyvalid = 1; else if (va == va_next) va = sva; if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, &lock)) { sva += PAGE_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finished(); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { if (!atomic_cmpset_long(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) { /* See pmap_remove_pde() for explanation. */ if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; mpte = NULL; lock = NULL; PMAP_LOCK(pmap); /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); opa = origpte & PG_FRAME; if (opa != pa) { if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pmap_pvh_free(&om->md, pmap, va); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE * otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t *pde, newpde; pt_entry_t PG_V; vm_page_t mpde; struct spglist free; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); pde = &pde[pmap_pde_index(va)]; if ((*pde & PG_V) != 0) { KASSERT(mpde->wire_count > 1, ("pmap_enter_pde: mpde's wire count is too low")); mpde->wire_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpde, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_pde(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pt_entry_t *pte, PG_V; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(addr)]; if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_started()/finished() calls around the * function are not needed. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; pt_entry_t PG_A, PG_M, PG_V; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. */ if (pmap_emulate_ad_bits(dst_pmap)) return; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpde, dstmpte, srcmpte; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t srcptepaddr, *pde; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; dstmpde = pmap_allocpde(dst_pmap, addr, NULL); if (dstmpde == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else dstmpde->wire_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { if (dstmpte != NULL && dstmpte->pindex == pmap_pde_pindex(addr)) dstmpte->wire_count++; else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not * mapped, paging-structure * caches could nonetheless * have entries that refer to * the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; boolean_t mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pages[0] = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) pmap_unmap_io_transient(pages, vaddr, 2, FALSE); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } - mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); pmap_free_zero_pages(&free); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; boolean_t anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. */ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; pmap_delayed_invl_started(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pde_to_pte(pde, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, *pde, NULL, &lock); anychanged = TRUE; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finished(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { while (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size < dmaplimit) { va = PHYS_TO_DMAP(pa); if (!pmap_change_attr(va, size, mode)) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + tmpsize, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t mpdepa; vm_page_t mpde; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } mpdepa = VM_PAGE_TO_PHYS(mpde); firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) return (EINVAL); if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte, X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V)) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN || pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x", cpuid, pcid_next)); if (pcid_next == PMAP_PCID_OVERMAX) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; uint64_t cached, cr3; register_t rflags; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) return; cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif cr3 = rcr3(); if (pmap_pcid_enabled) { cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 && pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x", td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); /* * If the INVPCID instruction is not available, * invltlb_pcid_handler() is used for handle * invalidate_all IPI, which checks for curpmap == * smp_tlb_pmap. Below operations sequence has a * window where %CR3 is loaded with the new pmap's * PML4 address, but curpmap value is not yet updated. * This causes invltlb IPI handler, called between the * updates, to execute as NOP, which leaves stale TLB * entries. * * Note that the most typical use of * pmap_activate_sw(), from the context switch, is * immune to this race, because interrupts are * disabled (while the thread lock is owned), and IPI * happends after curpmap is updated. Protect other * callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap * assignment. */ if (!invpcid_works) rflags = intr_disable(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) { load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); if (cached) PCPU_INC(pm_save_cnt); } PCPU_SET(curpmap, pmap); if (!invpcid_works) intr_restore(rflags); } else if (cr3 != pmap->pm_cr3) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; vm_page_t m, mpte; pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. */ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. */ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (have_addr) { va = (vm_offset_t)addr; pmap = PCPU_GET(curpmap); /* XXX */ } else { db_printf("show pte addr\n"); return; } PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); db_printf("VA %#016lx pml4e %#016lx", va, *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); db_printf(" pdpe %#016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); db_printf(" pde %#016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); db_printf(" pte %#016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } #endif Index: stable/11/sys/arm64/arm64/pmap.c =================================================================== --- stable/11/sys/arm64/arm64/pmap.c (revision 318715) +++ stable/11/sys/arm64/arm64/pmap.c (revision 318716) @@ -1,4751 +1,4736 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; struct msgbuf *msgbufp = NULL; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set(table, mask) atomic_set_64(table, mask) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load(table) (*table) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } #define pmap_l0_index(va) (((va) >> L0_SHIFT) & L0_ADDR_MASK) #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } static inline bool pmap_superpages_enabled(void) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_is_current(pmap_t pmap) { return ((pmap == pmap_kernel()) || (pmap == curthread->td_proc->p_vmspace->vm_map.pmap)); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } /* Is a level 1 or 2entry a valid block and cacheable */ CTASSERT(L1_BLOCK == L2_BLOCK); static __inline int pmap_pte_valid_cacheable(pt_entry_t pte) { return (((pte & ATTR_DESCR_MASK) == L1_BLOCK) && ((pte & ATTR_IDX_MASK) == ATTR_IDX(CACHED_MEMORY))); } static __inline int pmap_l3_valid_cacheable(pt_entry_t l3) { return (((l3 & ATTR_DESCR_MASK) == L3_PAGE) && ((l3 & ATTR_IDX_MASK) == ATTR_IDX(CACHED_MEMORY))); } #define PTE_SYNC(pte) cpu_dcache_wb_range((vm_offset_t)pte, sizeof(*pte)) /* * Checks if the page is dirty. We currently lack proper tracking of this on * arm64 so for now assume is a page mapped as rw was accessed it is. */ static inline int pmap_page_dirty(pt_entry_t pte) { return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) == (ATTR_AF | ATTR_AP(ATTR_AP_RW))); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static void pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) { vm_offset_t va; vm_paddr_t pa; u_int l1_slot; pa = dmap_phys_base = min_pa & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; for (; va < DMAP_MAX_ADDRESS && pa < max_pa; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_load_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Set the upper limit of the DMAP region */ dmap_phys_max = pa; dmap_max_addr = va; cpu_dcache_wb_range((vm_offset_t)pagetable_dmap, PAGE_SIZE * DMAP_TABLES); cpu_tlb_flushID(); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_load_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); cpu_dcache_wb_range(l2_start, l2pt - l2_start); /* Flush the l1 table to ram */ cpu_dcache_wb_range((vm_offset_t)l1, PAGE_SIZE); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l2pt, l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2pt = (vm_offset_t)l2; l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_load_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); cpu_dcache_wb_range(l3_start, l3pt - l3_start); cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot; uint64_t kern_delta; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t pa, max_pa, min_pa; int i; kern_delta = KERNBASE - kernstart; physmem = 0; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* Assume the address we were loaded to is a valid physical address */ min_pa = max_pa = KERNBASE - kern_delta; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; if (physmap[i + 1] > max_pa) max_pa = physmap[i + 1]; } /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa, max_pa); va = KERNBASE; pa = KERNBASE - kern_delta; /* * Start to initialise phys_avail by copying from physmap * up to the physical address KERNBASE points at. */ map_slot = avail_slot = 0; for (; map_slot < (physmap_idx * 2) && avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) { if (physmap[map_slot] == physmap[map_slot + 1]) continue; if (physmap[map_slot] <= pa && physmap[map_slot + 1] > pa) break; phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } /* Add the memory before the kernel */ if (physmap[avail_slot] < pa && avail_slot < (PHYS_AVAIL_SIZE - 2)) { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = pa; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } used_map_slot = map_slot; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); /* Finish initialising physmap */ map_slot = used_map_slot; for (; avail_slot < (PHYS_AVAIL_SIZE - 2) && map_slot < (physmap_idx * 2); map_slot += 2) { if (physmap[map_slot] == physmap[map_slot + 1]) continue; /* Have we used the current range? */ if (physmap[map_slot + 1] <= pa) continue; /* Do we need to split the entry? */ if (physmap[map_slot] < pa) { phys_avail[avail_slot] = pa; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } else { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } phys_avail[avail_slot] = 0; phys_avail[avail_slot + 1] = 0; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = atop(phys_avail[avail_slot - 1]); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; sched_pin(); dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); sched_unpin(); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. */ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_paddr_t pa; vm_page_t m; int lvl; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || ((prot & VM_PROT_WRITE) == 0)) { switch(lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } if (vm_page_pa_tryrelock(pmap, (tpte & ~ATTR_MASK) | off, &pa)) goto retry; m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); vm_page_hold(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pa = 0; pte = pmap_pte(kernel_pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_kextract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_kextract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_kextract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ static void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE); PTE_SYNC(pte); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, DEVICE_MEMORY); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); if (pmap_l3_valid_cacheable(pmap_load(pte))) cpu_dcache_wb_range(va, L3_SIZE); pmap_load_clear(pte); PTE_SYNC(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_load_clear(pte); PTE_SYNC(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); PTE_SYNC(pte); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { if (pmap_l3_valid_cacheable(pmap_load(pte))) cpu_dcache_wb_range(va, L3_SIZE); pmap_load_clear(pte); PTE_SYNC(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_load_clear(l0); PTE_SYNC(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_load_clear(l1); PTE_SYNC(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_load_clear(l2); PTE_SYNC(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing an l3 entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0 = kernel_pmap->pm_l0; pmap->pm_root.rt_root = 0; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l0phys; vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; l0phys = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); VM_WAIT; PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); PTE_SYNC(l0); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { --m->wire_count; /* XXX: release mem barrier? */ atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->wire_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); PTE_SYNC(l1); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { --m->wire_count; /* XXX: release mem barrier? */ atomic_subtract_int( &vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->wire_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); PTE_SYNC(l2); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->wire_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0)); m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_load_store(l1, paddr | L1_TABLE); PTE_SYNC(l1); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & ATTR_AF) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_load_store(l2, paddr | L2_TABLE); PTE_SYNC(l2); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { panic("ARM64TODO: reclaim_pv_chunk"); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; int avail, free; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3))) cpu_dcache_wb_range(va, L3_SIZE); old_l3 = pmap_load_clear(l3); PTE_SYNC(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_page_dirty(old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_l3(pmap, va, l2e, free)); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr, *l3; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { /* TODO: Add pmap_remove_l2 */ if (pmap_demote_l2_locked(pmap, l2, sva & ~L2_OFFSET, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (l3 == NULL) panic("l3 == NULL"); if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free, &lock)) { sva += L3_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(tpte)) cpu_dcache_wb_range(pv->pv_va, L3_SIZE); pmap_load_clear(pte); PTE_SYNC(pte); pmap_invalidate_page(pmap, pv->pv_va); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_page_dirty(tpte)) vm_page_dirty(m); pmap_unuse_l3(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_free_zero_pages(&free); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & VM_PROT_WRITE) == VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { l3p = pmap_demote_l2(pmap, l2, sva); if (l3p == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); if (pmap_l3_valid(l3)) { if ((l3 & ATTR_SW_MANAGED) && pmap_page_dirty(l3)) { vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); } pmap_set(l3p, ATTR_AP(ATTR_AP_RO)); PTE_SYNC(l3p); /* XXX: Use pmap_invalidate_range */ pmap_invalidate_page(pmap, va); } } } PMAP_UNLOCK(pmap); /* TODO: Only invalidate entries we are touching */ pmap_invalidate_all(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* - * Looks for a page table page mapping the specified virtual address in the - * specified pmap's collection of idle page table pages. Returns NULL if there - * is no page table page corresponding to the specified virtual address. + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. */ static __inline vm_page_t -pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); - return (vm_radix_lookup(&pmap->pm_root, pmap_l2_pindex(va))); + return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* - * Removes the specified page table page from the specified pmap's collection - * of idle page table pages. The specified page table page must be a member of - * the pmap's collection. - */ -static __inline void -pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) -{ - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - vm_radix_remove(&pmap->pm_root, mpte->pindex); -} - -/* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); critical_enter(); /* Clear the old mapping */ pmap_load_clear(pte); PTE_SYNC(pte); pmap_invalidate_range(pmap, va, va + size); /* Create the new mapping */ pmap_load_store(pte, newpte); PTE_SYNC(pte); critical_exit(); intr_restore(intr); } /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); /* Check the alingment is valid */ if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa; vm_page_t mpte, om, l1_m, l2_m, l3_m; boolean_t nosleep; int lvl; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if ((va >> 63) == 0) new_l3 |= ATTR_AP(ATTR_AP_USER); CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); mpte = NULL; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } } if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } pde = pmap_pde(pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } else { /* * If we get a level 2 pde it must point to a level 3 entry * otherwise we will need to create the intermediate tables */ if (lvl < 2) { switch(lvl) { default: case -1: /* Get the l0 pde to update */ pde = pmap_l0(pmap, va); KASSERT(pde != NULL, ("...")); l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l1_m == NULL) panic("pmap_enter: l1 pte_m == NULL"); if ((l1_m->flags & PG_ZERO) == 0) pmap_zero_page(l1_m); l1_pa = VM_PAGE_TO_PHYS(l1_m); pmap_load_store(pde, l1_pa | L0_TABLE); PTE_SYNC(pde); /* FALLTHROUGH */ case 0: /* Get the l1 pde to update */ pde = pmap_l1_to_l2(pde, va); KASSERT(pde != NULL, ("...")); l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l2_m == NULL) panic("pmap_enter: l2 pte_m == NULL"); if ((l2_m->flags & PG_ZERO) == 0) pmap_zero_page(l2_m); l2_pa = VM_PAGE_TO_PHYS(l2_m); pmap_load_store(pde, l2_pa | L1_TABLE); PTE_SYNC(pde); /* FALLTHROUGH */ case 1: /* Get the l2 pde to update */ pde = pmap_l1_to_l2(pde, va); l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l3_m == NULL) panic("pmap_enter: l3 pte_m == NULL"); if ((l3_m->flags & PG_ZERO) == 0) pmap_zero_page(l3_m); l3_pa = VM_PAGE_TO_PHYS(l3_m); pmap_load_store(pde, l3_pa | L2_TABLE); PTE_SYNC(pde); break; } } l3 = pmap_l2_to_l3(pde, va); pmap_invalidate_page(pmap, va); } havel3: om = NULL; orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & ATTR_SW_MANAGED) != 0) { new_l3 |= ATTR_SW_MANAGED; if ((new_l3 & ATTR_AP(ATTR_AP_RW)) == ATTR_AP(ATTR_AP_RW)) { vm_page_aflag_set(m, PGA_WRITEABLE); } } goto validate; } /* Flush the cache, there might be uncommitted data in it */ if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(orig_l3)) cpu_dcache_wb_range(va, L3_SIZE); } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; pv = get_pv_entry(pmap, &lock); pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the L3 entry. */ if (orig_l3 != 0) { validate: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; if (opa != pa) { pmap_update_entry(pmap, l3, new_l3, va, PAGE_SIZE); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); if (pmap_page_dirty(orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pmap_pvh_free(&om->md, pmap, va); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } } else { pmap_load_store(l3, new_l3); PTE_SYNC(l3); pmap_invalidate_page(pmap, va); if (pmap_page_dirty(orig_l3) && (orig_l3 & ATTR_SW_MANAGED) != 0) vm_page_dirty(m); } } else { pmap_load_store(l3, new_l3); } PTE_SYNC(l3); pmap_invalidate_page(pmap, va); if (pmap != pmap_kernel()) { if (pmap == &curproc->p_vmspace->vm_pmap && (prot & VM_PROT_EXECUTE) != 0) cpu_icache_sync_range(va, PAGE_SIZE); if ((mpte == NULL || mpte->wire_count == NL3PG) && pmap_superpages_enabled() && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pd_entry_t *pde; pt_entry_t *l2, *l3; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) pa |= ATTR_SW_MANAGED; pmap_load_store(l3, pa); PTE_SYNC(l3); pmap_invalidate_page(pmap, va); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { l3 = pmap_demote_l2(pmap, l2, sva); if (l3 == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); tpte = pmap_load(pte); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); if (pmap_is_current(pmap)) { if (lvl == 2 && pmap_l3_valid_cacheable(tpte)) { cpu_dcache_wb_range(pv->pv_va, L3_SIZE); } else if (lvl == 1 && pmap_pte_valid_cacheable(tpte)) { cpu_dcache_wb_range(pv->pv_va, L2_SIZE); } } pmap_load_clear(pte); PTE_SYNC(pte); pmap_invalidate_page(pmap, pv->pv_va); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(m); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } - ml3 = pmap_lookup_pt_page(pmap, + ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { - pmap_remove_pt_page(pmap, ml3); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); atomic_subtract_int( &vm_cnt.v_wire_count, 1); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_l3(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); retry: oldpte = pmap_load(pte); if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) { if (!atomic_cmpset_long(pte, oldpte, oldpte | ATTR_AP(ATTR_AP_RO))) goto retry; if ((oldpte & ATTR_AF) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { return (FALSE); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; pt_entry_t *l3; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, lvl, pvh_gen; struct spglist free; bool demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { if (safe_to_clear_referenced(pmap, tpte)) { /* * TODO: We don't handle the access * flag at all. We need to be able * to set it in the exception handler. */ panic("ARM64TODO: " "safe_to_clear_referenced\n"); } else if (pmap_demote_l2_locked(pmap, pte, pv->pv_va, &lock) != NULL) { demoted = true; va += VM_PAGE_TO_PHYS(m) - (tpte & ~ATTR_MASK); l3 = pmap_l2_to_l3(pte, va); pmap_remove_l3(pmap, l3, va, pmap_load(pte), NULL, &lock); } else demoted = true; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if ((tpte & ATTR_AF) != 0) { if (safe_to_clear_referenced(pmap, tpte)) { /* * TODO: We don't handle the access flag * at all. We need to be able to set it in * the exception handler. */ panic("ARM64TODO: safe_to_clear_referenced\n"); } else if ((tpte & ATTR_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_l3(pmap, pte, pv->pv_va, tpde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); pmap_free_zero_pages(&free); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; /* ARM64TODO: We lack support for tracking if a page is modified */ } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return ((void *)PHYS_TO_DMAP(pa)); } void pmap_unmapbios(vm_paddr_t pa, vm_size_t size) { } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, va, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva & ~L2_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_IDX_MASK; l3 |= ATTR_IDX(mode); pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. */ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE); KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys, phys; vm_page_t ml3; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); KASSERT((va & L2_OFFSET) == 0, ("pmap_demote_l2: Invalid virtual address %#lx", va)); tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } - if ((ml3 = pmap_lookup_pt_page(pmap, va)) != NULL) { - pmap_remove_pt_page(pmap, ml3); - } else { + if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (ml3 == NULL) { CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) pmap_resident_count_inc(pmap, 1); } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); /* Address the range points at */ phys = oldl2 & ~ATTR_MASK; /* The attributed from the old l2 table to be copied */ newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE; /* * If the page table page is new, initialize it. */ if (ml3->wire_count == 1) { for (i = 0; i < Ln_ENTRIES; i++) { l3[i] = newl3 | phys; phys += L3_SIZE; } cpu_dcache_wb_range((vm_offset_t)l3, PAGE_SIZE); } KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE), ("Invalid l3 page (%lx != %lx)", l3[0], (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE)); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *l1p, l1; pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; bool managed; int val; PMAP_LOCK(pmap); retry: pa = 0; val = 0; managed = false; l1p = pmap_l1(pmap, addr); if (l1p == NULL) /* No l1 */ goto done; l1 = pmap_load(l1p); if ((l1 & ATTR_DESCR_MASK) == L1_INVAL) goto done; if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) { pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET); managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_SUPER | MINCORE_INCORE; if (pmap_page_dirty(l1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l1 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; goto done; } l2p = pmap_l1_to_l2(l1p, addr); if (l2p == NULL) /* No l2 */ goto done; l2 = pmap_load(l2p); if ((l2 & ATTR_DESCR_MASK) == L2_INVAL) goto done; if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) { pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET); managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_SUPER | MINCORE_INCORE; if (pmap_page_dirty(l2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l2 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; goto done; } l3p = pmap_l2_to_l3(l2p, addr); if (l3p == NULL) /* No l3 */ goto done; l3 = pmap_load(l2p); if ((l3 & ATTR_DESCR_MASK) == L3_INVAL) goto done; if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) { pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET); managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_INCORE; if (pmap_page_dirty(l3)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l3 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } done: if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_pcb->pcb_l0addr = vtophys(pmap->pm_l0); __asm __volatile("msr ttbr0_el1, %0" : : "r"(td->td_pcb->pcb_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { #ifdef SMP uint64_t par; #endif switch (ESR_ELx_EXCEPTION(esr)) { case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (KERN_FAILURE); } #ifdef SMP PMAP_LOCK(pmap); switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: /* Ask the MMU to check the address */ if (pmap == kernel_pmap) par = arm64_address_translate_s1e1r(far); else par = arm64_address_translate_s1e0r(far); /* * If the translation was successful the address was invalid * due to a break-before-make sequence. We can unlock and * return success to the trap handler. */ if (PAR_SUCCESS(par)) { PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } break; default: break; } PMAP_UNLOCK(pmap); #endif return (KERN_FAILURE); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } Index: stable/11/sys/cddl/compat/opensolaris/sys/vnode.h =================================================================== --- stable/11/sys/cddl/compat/opensolaris/sys/vnode.h (revision 318715) +++ stable/11/sys/cddl/compat/opensolaris/sys/vnode.h (revision 318716) @@ -1,287 +1,286 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_VNODE_H_ #define _OPENSOLARIS_SYS_VNODE_H_ #ifdef _KERNEL struct vnode; struct vattr; typedef struct vnode vnode_t; typedef struct vattr vattr_t; typedef enum vtype vtype_t; #include enum symfollow { NO_FOLLOW = NOFOLLOW }; #include #include_next #include #include #include #include #include #include typedef struct vop_vector vnodeops_t; #define VOP_FID VOP_VPTOFH #define vop_fid vop_vptofh #define vop_fid_args vop_vptofh_args #define a_fid a_fhp #define IS_XATTRDIR(dvp) (0) #define v_count v_usecount #define V_APPEND VAPPEND #define rootvfs (rootvnode == NULL ? NULL : rootvnode->v_mount) static __inline int vn_is_readonly(vnode_t *vp) { return (vp->v_mount->mnt_flag & MNT_RDONLY); } #define vn_vfswlock(vp) (0) #define vn_vfsunlock(vp) do { } while (0) #define vn_ismntpt(vp) ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) #define vn_mountedvfs(vp) ((vp)->v_mountedhere) #define vn_has_cached_data(vp) \ ((vp)->v_object != NULL && \ - ((vp)->v_object->resident_page_count > 0 || \ - !vm_object_cache_is_empty((vp)->v_object))) + (vp)->v_object->resident_page_count > 0) #define vn_exists(vp) do { } while (0) #define vn_invalid(vp) do { } while (0) #define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) #define vn_free(vp) do { } while (0) #define vn_matchops(vp, vops) ((vp)->v_op == &(vops)) #define VN_HOLD(v) vref(v) #define VN_RELE(v) vrele(v) #define VN_URELE(v) vput(v) #define vnevent_create(vp, ct) do { } while (0) #define vnevent_link(vp, ct) do { } while (0) #define vnevent_remove(vp, dvp, name, ct) do { } while (0) #define vnevent_rmdir(vp, dvp, name, ct) do { } while (0) #define vnevent_rename_src(vp, dvp, name, ct) do { } while (0) #define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) #define vnevent_rename_dest_dir(vp, ct) do { } while (0) #define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) #define MANDMODE(mode) (0) #define MANDLOCK(vp, mode) (0) #define chklock(vp, op, offset, size, mode, ct) (0) #define cleanlocks(vp, pid, foo) do { } while (0) #define cleanshares(vp, pid) do { } while (0) /* * We will use va_spare is place of Solaris' va_mask. * This field is initialized in zfs_setattr(). */ #define va_mask va_spare /* TODO: va_fileid is shorter than va_nodeid !!! */ #define va_nodeid va_fileid /* TODO: This field needs conversion! */ #define va_nblocks va_bytes #define va_blksize va_blocksize #define va_seq va_gen #define MAXOFFSET_T OFF_MAX #define EXCL 0 #define ACCESSED (AT_ATIME) #define STATE_CHANGED (AT_CTIME) #define CONTENT_MODIFIED (AT_MTIME | AT_CTIME) static __inline void vattr_init_mask(vattr_t *vap) { vap->va_mask = 0; if (vap->va_type != VNON) vap->va_mask |= AT_TYPE; if (vap->va_uid != (uid_t)VNOVAL) vap->va_mask |= AT_UID; if (vap->va_gid != (gid_t)VNOVAL) vap->va_mask |= AT_GID; if (vap->va_size != (u_quad_t)VNOVAL) vap->va_mask |= AT_SIZE; if (vap->va_atime.tv_sec != VNOVAL) vap->va_mask |= AT_ATIME; if (vap->va_mtime.tv_sec != VNOVAL) vap->va_mask |= AT_MTIME; if (vap->va_mode != (u_short)VNOVAL) vap->va_mask |= AT_MODE; if (vap->va_flags != VNOVAL) vap->va_mask |= AT_XVATTR; } #define FCREAT O_CREAT #define FTRUNC O_TRUNC #define FEXCL O_EXCL #define FDSYNC FFSYNC #define FRSYNC FFSYNC #define FSYNC FFSYNC #define FOFFMAX 0x00 #define FIGNORECASE 0x00 static __inline int vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode, vnode_t **vpp, enum create crwhy, mode_t umask, struct vnode *startvp, int fd) { struct thread *td = curthread; struct nameidata nd; int error, operation; ASSERT(seg == UIO_SYSSPACE); if ((filemode & FCREAT) != 0) { ASSERT(filemode == (FWRITE | FCREAT | FTRUNC | FOFFMAX)); ASSERT(crwhy == CRCREAT); operation = CREATE; } else { ASSERT(filemode == (FREAD | FOFFMAX) || filemode == (FREAD | FWRITE | FOFFMAX)); ASSERT(crwhy == 0); operation = LOOKUP; } ASSERT(umask == 0); pwd_ensure_dirs(); if (startvp != NULL) vref(startvp); NDINIT_ATVP(&nd, operation, 0, UIO_SYSSPACE, pnamep, startvp, td); filemode |= O_NOFOLLOW; error = vn_open_cred(&nd, &filemode, createmode, 0, td->td_ucred, NULL); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) { /* We just unlock so we hold a reference. */ VOP_UNLOCK(nd.ni_vp, 0); *vpp = nd.ni_vp; } return (error); } static __inline int zfs_vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, vnode_t **vpp, enum create crwhy, mode_t umask) { return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, umask, NULL, -1)); } #define vn_open(pnamep, seg, filemode, createmode, vpp, crwhy, umask) \ zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask)) #define RLIM64_INFINITY 0 static __inline int zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, int ulimit, cred_t *cr, ssize_t *residp) { struct thread *td = curthread; int error; ssize_t resid; ASSERT(ioflag == 0); ASSERT(ulimit == RLIM64_INFINITY); if (rw == UIO_WRITE) { ioflag = IO_SYNC; } else { ioflag = IO_DIRECT; } error = vn_rdwr(rw, vp, base, len, offset, seg, ioflag, cr, NOCRED, &resid, td); if (residp != NULL) *residp = (ssize_t)resid; return (error); } #define vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \ zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp)) static __inline int zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr) { struct mount *mp; int error; ASSERT(flag == FSYNC); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: return (error); } #define VOP_FSYNC(vp, flag, cr, ct) zfs_vop_fsync((vp), (flag), (cr)) static __inline int zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { int error; ASSERT(count == 1); ASSERT(offset == 0); error = vn_close(vp, flag, cr, curthread); return (error); } #define VOP_CLOSE(vp, oflags, count, offset, cr, ct) \ zfs_vop_close((vp), (oflags), (count), (offset), (cr)) static __inline int vn_rename(char *from, char *to, enum uio_seg seg) { ASSERT(seg == UIO_SYSSPACE); return (kern_renameat(curthread, AT_FDCWD, from, AT_FDCWD, to, seg)); } static __inline int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) { ASSERT(seg == UIO_SYSSPACE); ASSERT(dirflag == RMFILE); return (kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0)); } #endif /* _KERNEL */ #endif /* _OPENSOLARIS_SYS_VNODE_H_ */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 318715) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 318716) @@ -1,6079 +1,6075 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; offset_t ndata; dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { #ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else *(offset_t *)data = off; #endif return (0); } #ifdef illumos case _FIO_COUNT_FILLED: { /* * _FIO_COUNT_FILLED adds a new ioctl command which * exposes the number of filled blocks in a * ZFS object. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Wait for all dirty blocks for this object * to get synced out to disk, and the DMU info * updated. */ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); if (error) { ZFS_EXIT(zfsvfs); return (error); } /* * Retrieve fill count from DMU object. */ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); if (error) { ZFS_EXIT(zfsvfs); return (error); } ndata = doi.doi_fill_count; ZFS_EXIT(zfsvfs); if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) return (SET_ERROR(EFAULT)); return (0); } #endif } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considred clean despite have some dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); - } else if (pp == NULL) { - pp = vm_page_alloc(obj, OFF_TO_IDX(start), - VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED | - VM_ALLOC_SBUSY); } else { ASSERT(pp != NULL && !pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_subtract(pp->object, 1); } static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_lock(pp); vm_page_hold(pp); vm_page_unlock(pp); } else pp = NULL; break; } return (pp); } static void page_unhold(vm_page_t pp) { vm_page_lock(pp); vm_page_unhold(pp); vm_page_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH);; zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeupn(obj, 0); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_hold(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); #ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); #else error = vn_io_fault_uiomove(va + off, bytes, uio); #endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unhold(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); #ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* illumos */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our * callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef illumos /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ #ifdef illumos size_t cbytes; #endif abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); #ifdef illumos if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); #else ssize_t resid = uio->uio_resid; error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); if (error != 0) { uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; dmu_return_arcbuf(abuf); break; } #endif } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } #ifdef illumos ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); #endif } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); #ifdef illumos ASSERT(error == 0); #else ASSERT(error == 0 || error == EFAULT); #endif } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef illumos if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } #ifdef __FreeBSD__ /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } #endif if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT(zio != NULL); ASSERT(size != 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *obp = dmu_buf_get_blkptr(db); if (obp) { ASSERT(BP_IS_HOLE(bp)); *bp = *obp; } zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= zp->z_blksz); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; int ltype; ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (dvp->v_iflag & VI_DOOMED) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* fast path (should be redundant with vfs namecache) */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } #endif /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { vrele(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } out: if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(1); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: if (error == 0) { *vpp = ZTOV(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked, toobig = FALSE; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; zp = VTOZ(vp); xattr_obj = 0; xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *vpp = NULL; if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } cache_purge(dvp); error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); cache_purge(vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t links; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; #endif vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) links = zp->z_links + 1; else links = zp->z_links; vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ vap->va_size = zp->z_size; #ifdef illumos vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). */ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp, 0); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(*svpp, 0); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; char *snm = scnp->cn_nameptr; char *tnm = tcnp->cn_nameptr; int error = 0; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_check(szp, sdzp, tdzp)) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_purge(*svpp); if (*tvpp != NULL) cache_purge(*tvpp); cache_purge_negative(tdvp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(*svpp, 0); VOP_UNLOCK(sdvp, 0); out: /* original two vnodes are locked */ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK(*tvpp, 0); if (tdvp != *tvpp) VOP_UNLOCK(tdvp, 0); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; #ifdef illumos if (fidp->fid_len < size) { fidp->fid_len = size; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } #else fidp->fid_len = size; #endif zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = INT_MAX; return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #ifdef illumos case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { if (!zfs_dirempty(xzp)) *valp = 1; vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. */ error = 0; } ZFS_EXIT(zfsvfs); return (error); case _PC_SATTR_ENABLED: case _PC_SATTR_EXISTS: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_type == VREG || vp->v_type == VDIR); return (0); case _PC_ACCESS_FILTERING: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && vp->v_type == VDIR; return (0); case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); #endif /* illumos */ case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); #ifdef illumos case _PC_TIMESTAMP_RESOLUTION: /* nanosecond timestamp resolution */ *valp = 1L; return (0); #endif case _PC_ACL_EXTENDED: *valp = 0; return (0); case _PC_ACL_NFS4: *valp = 1; return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } static int zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; vm_page_t mlast; vm_object_t object; caddr_t va; struct sf_buf *sf; off_t startoff, endoff; int i, error; vm_pindex_t reqstart, reqend; int lsize, size; object = m[0]->object; error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_vmobject_wlock(object); if (m[count - 1]->valid != 0 && --count == 0) { zfs_vmobject_wunlock(object); goto out; } mlast = m[count - 1]; if (IDX_TO_OFF(mlast->pindex) >= object->un_pager.vnp.vnp_size) { zfs_vmobject_wunlock(object); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, count); lsize = PAGE_SIZE; if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size) lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex); zfs_vmobject_wunlock(object); for (i = 0; i < count; i++) { size = PAGE_SIZE; if (i == count - 1) size = lsize; va = zfs_map_page(m[i], &sf); error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex), size, va, DMU_READ_PREFETCH); if (size != PAGE_SIZE) bzero(va + size, PAGE_SIZE - size); zfs_unmap_page(sf); if (error != 0) goto out; } zfs_vmobject_wlock(object); for (i = 0; i < count; i++) m[i]->valid = VM_PAGE_BITS_ALL; zfs_vmobject_wunlock(object); out: ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error == 0) { if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; return (zfs_vm_pagerret_ok); } else return (zfs_vm_pagerret_error); } static int zfs_freebsd_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { i = 0; for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); (void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_range_unlock(rl); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } int zfs_freebsd_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } static int zfs_freebsd_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL, NULL)); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0)); } static int zfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (error); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vattr_t *vap = ap->a_vap; xvattr_t xvap; u_long fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; u_long fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the security.jail.chflags_allowed sysctl is * is non-zero; otherwise, they behave like unprivileged * processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); } static int zfs_freebsd_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } static int zfs_freebsd_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, ap->a_target, cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) *ap->a_retval = val; else if (error == EOPNOTSUPP) error = vop_stdpathconf(ap); return (error); } static int zfs_freebsd_fifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_ACL_PATH_MAX: case _PC_MAC_PRESENT: return (zfs_freebsd_pathconf(ap)); default: return (fifo_specops.vop_pathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. */ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } /* * Vnode operating to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to remove a named attribute. */ int zfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; u_char dirbuf[sizeof(struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof(attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { u_char nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof(dirbuf); auio.uio_resid = sizeof(dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof(dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done;) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } int zfs_freebsd_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } int zfs_freebsd_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); kmem_free(aaclp, aclbsize); return (error); } int zfs_freebsd_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp;; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); uint64_t parent; int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; vhold(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC static int zfs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { vnode_t *vp; znode_t *zp; int err; err = vop_stdlock(ap); if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { vp = ap->a_vp; zp = vp->v_data; if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_fifo_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, }; Index: stable/11/sys/fs/tmpfs/tmpfs_subr.c =================================================================== --- stable/11/sys/fs/tmpfs/tmpfs_subr.c (revision 318715) +++ stable/11/sys/fs/tmpfs/tmpfs_subr.c (revision 318716) @@ -1,1857 +1,1854 @@ /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ /*- * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system supporting functions. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW, 0, "tmpfs file system"); static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; static int sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) { int error; long pages, bytes; pages = *(long *)arg1; bytes = pages * PAGE_SIZE; error = sysctl_handle_long(oidp, &bytes, 0, req); if (error || !req->newptr) return (error); pages = bytes / PAGE_SIZE; if (pages < TMPFS_PAGES_MINRESERVED) return (EINVAL); *(long *)arg1 = pages; return (0); } SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, CTLTYPE_LONG|CTLFLAG_RW, &tmpfs_pages_reserved, 0, sysctl_mem_reserved, "L", "Amount of available memory and swap below which tmpfs growth stops"); static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b); RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); size_t tmpfs_mem_avail(void) { vm_ooffset_t avail; avail = swap_pager_avail + vm_cnt.v_free_count + vm_cnt.v_cache_count - tmpfs_pages_reserved; if (__predict_false(avail < 0)) avail = 0; return (avail); } size_t tmpfs_pages_used(struct tmpfs_mount *tmp) { const size_t node_size = sizeof(struct tmpfs_node) + sizeof(struct tmpfs_dirent); size_t meta_pages; meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, PAGE_SIZE); return (meta_pages + tmp->tm_pages_used); } static size_t tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) { if (tmpfs_mem_avail() < req_pages) return (0); if (tmp->tm_pages_max != ULONG_MAX && tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) return (0); return (1); } void tmpfs_ref_node(struct tmpfs_node *node) { TMPFS_NODE_LOCK(node); tmpfs_ref_node_locked(node); TMPFS_NODE_UNLOCK(node); } void tmpfs_ref_node_locked(struct tmpfs_node *node) { TMPFS_NODE_ASSERT_LOCKED(node); KASSERT(node->tn_refcount > 0, ("node %p zero refcount", node)); KASSERT(node->tn_refcount < UINT_MAX, ("node %p refcount %u", node, node->tn_refcount)); node->tn_refcount++; } /* * Allocates a new node of type 'type' inside the 'tmp' mount point, with * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', * using the credentials of the process 'p'. * * If the node type is set to 'VDIR', then the parent parameter must point * to the parent directory of the node being created. It may only be NULL * while allocating the root node. * * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter * specifies the device the node represents. * * If the node type is set to 'VLNK', then the parameter target specifies * the file name of the target file for the symbolic link that is being * created. * * Note that new nodes are retrieved from the available list if it has * items or, if it is empty, from the node pool as long as there is enough * space to create them. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type, uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, char *target, dev_t rdev, struct tmpfs_node **node) { struct tmpfs_node *nnode; vm_object_t obj; /* If the root directory of the 'tmp' file system is not yet * allocated, this must be the request to do it. */ MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); KASSERT(tmp->tm_root == NULL || mp->mnt_writeopcount > 0, ("creating node not under vn_start_write")); MPASS(IFF(type == VLNK, target != NULL)); MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL)); if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) return (ENOSPC); if (tmpfs_pages_check_avail(tmp, 1) == 0) return (ENOSPC); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { /* * When a new tmpfs node is created for fully * constructed mount point, there must be a parent * node, which vnode is locked exclusively. As * consequence, if the unmount is executing in * parallel, vflush() cannot reclaim the parent vnode. * Due to this, the check for MNTK_UNMOUNT flag is not * racy: if we did not see MNTK_UNMOUNT flag, then tmp * cannot be destroyed until node construction is * finished and the parent vnode unlocked. * * Tmpfs does not need to instantiate new nodes during * unmount. */ return (EBUSY); } nnode = (struct tmpfs_node *)uma_zalloc_arg(tmp->tm_node_pool, tmp, M_WAITOK); /* Generic initialization. */ nnode->tn_type = type; vfs_timestamp(&nnode->tn_atime); nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = nnode->tn_atime; nnode->tn_uid = uid; nnode->tn_gid = gid; nnode->tn_mode = mode; nnode->tn_id = alloc_unr(tmp->tm_ino_unr); nnode->tn_refcount = 1; /* Type-specific initialization. */ switch (nnode->tn_type) { case VBLK: case VCHR: nnode->tn_rdev = rdev; break; case VDIR: RB_INIT(&nnode->tn_dir.tn_dirhead); LIST_INIT(&nnode->tn_dir.tn_dupindex); MPASS(parent != nnode); MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; nnode->tn_dir.tn_readdir_lastn = 0; nnode->tn_dir.tn_readdir_lastp = NULL; nnode->tn_links++; TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); nnode->tn_dir.tn_parent->tn_links++; TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); break; case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: MPASS(strlen(target) < MAXPATHLEN); nnode->tn_size = strlen(target); nnode->tn_link = malloc(nnode->tn_size, M_TMPFSNAME, M_WAITOK); memcpy(nnode->tn_link, target, nnode->tn_size); break; case VREG: obj = nnode->tn_reg.tn_aobj = vm_pager_allocate(OBJT_SWAP, NULL, 0, VM_PROT_DEFAULT, 0, NULL /* XXXKIB - tmpfs needs swap reservation */); VM_OBJECT_WLOCK(obj); /* OBJ_TMPFS is set together with the setting of vp->v_object */ vm_object_set_flag(obj, OBJ_NOSPLIT | OBJ_TMPFS_NODE); vm_object_clear_flag(obj, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(obj); break; default: panic("tmpfs_alloc_node: type %p %d", nnode, (int)nnode->tn_type); } TMPFS_LOCK(tmp); LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); nnode->tn_attached = true; tmp->tm_nodes_inuse++; tmp->tm_refcount++; TMPFS_UNLOCK(tmp); *node = nnode; return (0); } /* * Destroys the node pointed to by node from the file system 'tmp'. * If the node references a directory, no entries are allowed. */ void tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) { TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(node); if (!tmpfs_free_node_locked(tmp, node, false)) { TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); } } bool tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, bool detach) { vm_object_t uobj; TMPFS_MP_ASSERT_LOCKED(tmp); TMPFS_NODE_ASSERT_LOCKED(node); KASSERT(node->tn_refcount > 0, ("node %p refcount zero", node)); node->tn_refcount--; if (node->tn_attached && (detach || node->tn_refcount == 0)) { MPASS(tmp->tm_nodes_inuse > 0); tmp->tm_nodes_inuse--; LIST_REMOVE(node, tn_entries); node->tn_attached = false; } if (node->tn_refcount > 0) return (false); #ifdef INVARIANTS MPASS(node->tn_vnode == NULL); MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); #endif TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VDIR: /* FALLTHROUGH */ case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: free(node->tn_link, M_TMPFSNAME); break; case VREG: uobj = node->tn_reg.tn_aobj; if (uobj != NULL) { atomic_subtract_long(&tmp->tm_pages_used, uobj->size); KASSERT((uobj->flags & OBJ_TMPFS) == 0, ("leaked OBJ_TMPFS node %p vm_obj %p", node, uobj)); vm_object_deallocate(uobj); } break; default: panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type); } free_unr(tmp->tm_ino_unr, node->tn_id); uma_zfree(tmp->tm_node_pool, node); TMPFS_LOCK(tmp); tmpfs_free_tmp(tmp); return (true); } static __inline uint32_t tmpfs_dirent_hash(const char *name, u_int len) { uint32_t hash; hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP hash &= 0xf; #endif if (hash < TMPFS_DIRCOOKIE_MIN) hash += TMPFS_DIRCOOKIE_MIN; return (hash); } static __inline off_t tmpfs_dirent_cookie(struct tmpfs_dirent *de) { if (de == NULL) return (TMPFS_DIRCOOKIE_EOF); MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); return (de->td_cookie); } static __inline boolean_t tmpfs_dirent_dup(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); } static __inline boolean_t tmpfs_dirent_duphead(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); } void tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) { de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); memcpy(de->ud.td_name, name, namelen); de->td_namelen = namelen; } /* * Allocates a new directory entry for the node node with a name of name. * The new directory entry is returned in *de. * * The link count of node is increased by one to reflect the new object * referencing it. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, const char *name, u_int len, struct tmpfs_dirent **de) { struct tmpfs_dirent *nde; nde = uma_zalloc(tmp->tm_dirent_pool, M_WAITOK); nde->td_node = node; if (name != NULL) { nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); tmpfs_dirent_init(nde, name, len); } else nde->td_namelen = 0; if (node != NULL) node->tn_links++; *de = nde; return 0; } /* * Frees a directory entry. It is the caller's responsibility to destroy * the node referenced by it if needed. * * The link count of node is decreased by one to reflect the removal of an * object that referenced it. This only happens if 'node_exists' is true; * otherwise the function will not access the node referred to by the * directory entry, as it may already have been released from the outside. */ void tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) { struct tmpfs_node *node; node = de->td_node; if (node != NULL) { MPASS(node->tn_links > 0); node->tn_links--; } if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) free(de->ud.td_name, M_TMPFSNAME); uma_zfree(tmp->tm_dirent_pool, de); } void tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) { ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); if (vp->v_type != VREG || obj == NULL) return; VM_OBJECT_WLOCK(obj); VI_LOCK(vp); vm_object_clear_flag(obj, OBJ_TMPFS); obj->un_pager.swp.swp_tmpfs = NULL; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(obj); } /* * Need to clear v_object for insmntque failure. */ static void tmpfs_insmntque_dtr(struct vnode *vp, void *dtr_arg) { tmpfs_destroy_vobject(vp, vp->v_object); vp->v_object = NULL; vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Allocates a new vnode for the node node or returns a new reference to * an existing one if the node had already a vnode referencing it. The * resulting locked vnode is returned in *vpp. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, struct vnode **vpp) { struct vnode *vp; struct tmpfs_mount *tm; vm_object_t object; int error; error = 0; tm = VFS_TO_TMPFS(mp); TMPFS_NODE_LOCK(node); tmpfs_ref_node_locked(node); loop: TMPFS_NODE_ASSERT_LOCKED(node); if ((vp = node->tn_vnode) != NULL) { MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); VI_LOCK(vp); if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || ((vp->v_iflag & VI_DOOMED) != 0 && (lkflag & LK_NOWAIT) != 0)) { VI_UNLOCK(vp); TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } if ((vp->v_iflag & VI_DOOMED) != 0) { VI_UNLOCK(vp); node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 0, "tmpfsE", 0); } goto loop; } TMPFS_NODE_UNLOCK(node); error = vget(vp, lkflag | LK_INTERLOCK, curthread); if (error == ENOENT) { TMPFS_NODE_LOCK(node); goto loop; } if (error != 0) { vp = NULL; goto out; } /* * Make sure the vnode is still there after * getting the interlock to avoid racing a free. */ if (node->tn_vnode == NULL || node->tn_vnode != vp) { vput(vp); TMPFS_NODE_LOCK(node); goto loop; } goto out; } if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } /* * otherwise lock the vp list while we call getnewvnode * since that can block. */ if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { node->tn_vpstate |= TMPFS_VNODE_WANT; error = msleep((caddr_t) &node->tn_vpstate, TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); if (error != 0) goto out; goto loop; } else node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; TMPFS_NODE_UNLOCK(node); /* Get a new vnode and associate it with our node. */ error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); if (error != 0) goto unlock; MPASS(vp != NULL); /* lkflag is ignored, the lock is exclusive */ (void) vn_lock(vp, lkflag | LK_RETRY); vp->v_data = node; vp->v_type = node->tn_type; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VSOCK: break; case VFIFO: vp->v_op = &tmpfs_fifoop_entries; break; case VREG: object = node->tn_reg.tn_aobj; VM_OBJECT_WLOCK(object); VI_LOCK(vp); KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); vp->v_object = object; object->un_pager.swp.swp_tmpfs = vp; vm_object_set_flag(object, OBJ_TMPFS); VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); break; case VDIR: MPASS(node->tn_dir.tn_parent != NULL); if (node->tn_dir.tn_parent == node) vp->v_vflag |= VV_ROOT; break; default: panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); } if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); error = insmntque1(vp, mp, tmpfs_insmntque_dtr, NULL); if (error != 0) vp = NULL; unlock: TMPFS_NODE_LOCK(node); MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; node->tn_vnode = vp; if (node->tn_vpstate & TMPFS_VNODE_WANT) { node->tn_vpstate &= ~TMPFS_VNODE_WANT; TMPFS_NODE_UNLOCK(node); wakeup((caddr_t) &node->tn_vpstate); } else TMPFS_NODE_UNLOCK(node); out: if (error == 0) { *vpp = vp; #ifdef INVARIANTS MPASS(*vpp != NULL && VOP_ISLOCKED(*vpp)); TMPFS_NODE_LOCK(node); MPASS(*vpp == node->tn_vnode); TMPFS_NODE_UNLOCK(node); #endif } tmpfs_free_node(tm, node); return (error); } /* * Destroys the association between the vnode vp and the node it * references. */ void tmpfs_free_vp(struct vnode *vp) { struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); TMPFS_NODE_ASSERT_LOCKED(node); node->tn_vnode = NULL; if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) wakeup(&node->tn_vnode); node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; vp->v_data = NULL; } /* * Allocates a new file of type 'type' and adds it to the parent directory * 'dvp'; this addition is done using the component name given in 'cnp'. * The ownership of the new file is automatically assigned based on the * credentials of the caller (through 'cnp'), the group is set based on * the parent directory and the mode is determined from the 'vap' argument. * If successful, *vpp holds a vnode to the newly created file and zero * is returned. Otherwise *vpp is NULL and the function returns an * appropriate error code. */ int tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, struct componentname *cnp, char *target) { int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; struct tmpfs_node *parent; ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); MPASS(cnp->cn_flags & HASBUF); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULL; /* If the entry we are creating is a directory, we cannot overflow * the number of links of its parent, because it will get a new * link. */ if (vap->va_type == VDIR) { /* Ensure that we do not overflow the maximum number of links * imposed by the system. */ MPASS(dnode->tn_links <= LINK_MAX); if (dnode->tn_links == LINK_MAX) { return (EMLINK); } parent = dnode; MPASS(parent != NULL); } else parent = NULL; /* Allocate a node that represents the new file. */ error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, target, vap->va_rdev, &node); if (error != 0) return (error); /* Allocate a directory entry that points to the new file. */ error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) { tmpfs_free_node(tmp, node); return (error); } /* Allocate a vnode for the new file. */ error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); if (error != 0) { tmpfs_free_dirent(tmp, de); tmpfs_free_node(tmp, node); return (error); } /* Now that all required items are allocated, we can proceed to * insert the new node into the directory, an operation that * cannot fail. */ if (cnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(dvp, cnp); tmpfs_dir_attach(dvp, de); return (0); } struct tmpfs_dirent * tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); dc->tdc_tree = de; if (de != NULL && tmpfs_dirent_duphead(de)) de = LIST_FIRST(&de->ud.td_duphead); dc->tdc_current = de; return (dc->tdc_current); } struct tmpfs_dirent * tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; MPASS(dc->tdc_tree != NULL); if (tmpfs_dirent_dup(dc->tdc_current)) { dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); if (dc->tdc_current != NULL) return (dc->tdc_current); } dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, dc->tdc_tree); if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); } /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ static struct tmpfs_dirent * tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) { struct tmpfs_dirent *de, dekey; dekey.td_hash = hash; de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); return (de); } /* Lookup directory entry by cookie, initialize directory cursor accordingly. */ static struct tmpfs_dirent * tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, struct tmpfs_dir_cursor *dc) { struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; struct tmpfs_dirent *de, dekey; MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); if (cookie == node->tn_dir.tn_readdir_lastn && (de = node->tn_dir.tn_readdir_lastp) != NULL) { /* Protect against possible race, tn_readdir_last[pn] * may be updated with only shared vnode lock held. */ if (cookie == tmpfs_dirent_cookie(de)) goto out; } if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { LIST_FOREACH(de, &node->tn_dir.tn_dupindex, uh.td_dup.index_entries) { MPASS(tmpfs_dirent_dup(de)); if (de->td_cookie == cookie) goto out; /* dupindex list is sorted. */ if (de->td_cookie < cookie) { de = NULL; goto out; } } MPASS(de == NULL); goto out; } if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) { de = NULL; } else { dekey.td_hash = cookie; /* Recover if direntry for cookie was removed */ de = RB_NFIND(tmpfs_dir, dirhead, &dekey); } dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); out: dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_dup(de)) dc->tdc_tree = tmpfs_dir_xlookup_hash(node, de->td_hash); return (dc->tdc_current); } /* * Looks for a directory entry in the directory represented by node. * 'cnp' describes the name of the entry to look for. Note that the . * and .. components are not allowed as they do not physically exist * within directories. * * Returns a pointer to the entry when found, otherwise NULL. */ struct tmpfs_dirent * tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, struct componentname *cnp) { struct tmpfs_dir_duphead *duphead; struct tmpfs_dirent *de; uint32_t hash; MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.'))); TMPFS_VALIDATE_DIR(node); hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); de = tmpfs_dir_xlookup_hash(node, hash); if (de != NULL && tmpfs_dirent_duphead(de)) { duphead = &de->ud.td_duphead; LIST_FOREACH(de, duphead, uh.td_dup.entries) { if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) break; } } else if (de != NULL) { if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) de = NULL; } if (de != NULL && f != NULL && de->td_node != f) de = NULL; return (de); } /* * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex * list, allocate new cookie value. */ static void tmpfs_dir_attach_dup(struct tmpfs_node *dnode, struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) { struct tmpfs_dir_duphead *dupindex; struct tmpfs_dirent *de, *pde; dupindex = &dnode->tn_dir.tn_dupindex; de = LIST_FIRST(dupindex); if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { if (de == NULL) nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; else nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } /* * Cookie numbers are near exhaustion. Scan dupindex list for unused * numbers. dupindex list is sorted in descending order. Keep it so * after inserting nde. */ while (1) { pde = de; de = LIST_NEXT(de, uh.td_dup.index_entries); if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { /* * Last element of the index doesn't have minimal cookie * value, use it. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } else if (de == NULL) { /* * We are so lucky have 2^30 hash duplicates in single * directory :) Return largest possible cookie value. * It should be fine except possible issues with * VOP_READDIR restart. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } if (de->td_cookie + 1 == pde->td_cookie || de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) continue; /* No hole or invalid cookie. */ nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); MPASS(pde->td_cookie > nde->td_cookie); MPASS(nde->td_cookie > de->td_cookie); LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } } /* * Attaches the directory entry de to the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_alloc_dirent. */ void tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_node *dnode; struct tmpfs_dirent *xde, *nde; ASSERT_VOP_ELOCKED(vp, __func__); MPASS(de->td_namelen > 0); MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); MPASS(de->td_cookie == de->td_hash); dnode = VP_TO_TMPFS_DIR(vp); dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; MPASS(!tmpfs_dirent_dup(de)); xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); if (xde != NULL && tmpfs_dirent_duphead(xde)) tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); else if (xde != NULL) { /* * Allocate new duphead. Swap xde with duphead to avoid * adding/removing elements with the same hash. */ MPASS(!tmpfs_dirent_dup(xde)); tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, &nde); /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ memcpy(nde, xde, sizeof(*xde)); xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; LIST_INIT(&xde->ud.td_duphead); xde->td_namelen = 0; xde->td_node = NULL; tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); } dnode->tn_size += sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; tmpfs_update(vp); } /* * Detaches the directory entry de from the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_free_dirent. */ void tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_mount *tmp; struct tmpfs_dir *head; struct tmpfs_node *dnode; struct tmpfs_dirent *xde; ASSERT_VOP_ELOCKED(vp, __func__); dnode = VP_TO_TMPFS_DIR(vp); head = &dnode->tn_dir.tn_dirhead; dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; if (tmpfs_dirent_dup(de)) { /* Remove duphead if de was last entry. */ if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); MPASS(tmpfs_dirent_duphead(xde)); } else xde = NULL; LIST_REMOVE(de, uh.td_dup.entries); LIST_REMOVE(de, uh.td_dup.index_entries); if (xde != NULL) { if (LIST_EMPTY(&xde->ud.td_duphead)) { RB_REMOVE(tmpfs_dir, head, xde); tmp = VFS_TO_TMPFS(vp->v_mount); MPASS(xde->td_node == NULL); tmpfs_free_dirent(tmp, xde); } } de->td_cookie = de->td_hash; } else RB_REMOVE(tmpfs_dir, head, de); dnode->tn_size -= sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; tmpfs_update(vp); } void tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) { struct tmpfs_dirent *de, *dde, *nde; RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); /* Node may already be destroyed. */ de->td_node = NULL; if (tmpfs_dirent_duphead(de)) { while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { LIST_REMOVE(dde, uh.td_dup.entries); dde->td_node = NULL; tmpfs_free_dirent(tmp, dde); } } tmpfs_free_dirent(tmp, de); } } /* * Helper function for tmpfs_readdir. Creates a '.' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio) { int error; struct dirent dent; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); dent.d_fileno = node->tn_id; dent.d_type = DT_DIR; dent.d_namlen = 1; dent.d_name[0] = '.'; dent.d_name[1] = '\0'; dent.d_reclen = GENERIC_DIRSIZ(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_status(node, TMPFS_NODE_ACCESSED); return (error); } /* * Helper function for tmpfs_readdir. Creates a '..' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio) { int error; struct dirent dent; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); /* * Return ENOENT if the current node is already removed. */ TMPFS_ASSERT_LOCKED(node); if (node->tn_dir.tn_parent == NULL) return (ENOENT); TMPFS_NODE_LOCK(node->tn_dir.tn_parent); dent.d_fileno = node->tn_dir.tn_parent->tn_id; TMPFS_NODE_UNLOCK(node->tn_dir.tn_parent); dent.d_type = DT_DIR; dent.d_namlen = 2; dent.d_name[0] = '.'; dent.d_name[1] = '.'; dent.d_name[2] = '\0'; dent.d_reclen = GENERIC_DIRSIZ(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_status(node, TMPFS_NODE_ACCESSED); return (error); } /* * Helper function for tmpfs_readdir. Returns as much directory entries * as can fit in the uio space. The read starts at uio->uio_offset. * The function returns 0 on success, -1 if there was not enough space * in the uio structure to hold the directory entry or an appropriate * error code if another error happens. */ int tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, int maxcookies, u_long *cookies, int *ncookies) { struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de; off_t off; int error; TMPFS_VALIDATE_DIR(node); off = 0; /* * Lookup the node from the current offset. The starting offset of * 0 will lookup both '.' and '..', and then the first real entry, * or EOF if there are none. Then find all entries for the dir that * fit into the buffer. Once no more entries are found (de == NULL), * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next * call to return 0. */ switch (uio->uio_offset) { case TMPFS_DIRCOOKIE_DOT: error = tmpfs_dir_getdotdent(node, uio); if (error != 0) return (error); uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT; if (cookies != NULL) cookies[(*ncookies)++] = off = uio->uio_offset; /* FALLTHROUGH */ case TMPFS_DIRCOOKIE_DOTDOT: error = tmpfs_dir_getdotdotdent(node, uio); if (error != 0) return (error); de = tmpfs_dir_first(node, &dc); uio->uio_offset = tmpfs_dirent_cookie(de); if (cookies != NULL) cookies[(*ncookies)++] = off = uio->uio_offset; /* EOF. */ if (de == NULL) return (0); break; case TMPFS_DIRCOOKIE_EOF: return (0); default: de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); if (de == NULL) return (EINVAL); if (cookies != NULL) off = tmpfs_dirent_cookie(de); } /* Read as much entries as possible; i.e., until we reach the end of * the directory or we exhaust uio space. */ do { struct dirent d; /* Create a dirent structure representing the current * tmpfs_node and fill it. */ if (de->td_node == NULL) { d.d_fileno = 1; d.d_type = DT_WHT; } else { d.d_fileno = de->td_node->tn_id; switch (de->td_node->tn_type) { case VBLK: d.d_type = DT_BLK; break; case VCHR: d.d_type = DT_CHR; break; case VDIR: d.d_type = DT_DIR; break; case VFIFO: d.d_type = DT_FIFO; break; case VLNK: d.d_type = DT_LNK; break; case VREG: d.d_type = DT_REG; break; case VSOCK: d.d_type = DT_SOCK; break; default: panic("tmpfs_dir_getdents: type %p %d", de->td_node, (int)de->td_node->tn_type); } } d.d_namlen = de->td_namelen; MPASS(de->td_namelen < sizeof(d.d_name)); (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); d.d_name[de->td_namelen] = '\0'; d.d_reclen = GENERIC_DIRSIZ(&d); /* Stop reading if the directory entry we are treating is * bigger than the amount of data that can be returned. */ if (d.d_reclen > uio->uio_resid) { error = EJUSTRETURN; break; } /* Copy the new dirent structure into the output buffer and * advance pointers. */ error = uiomove(&d, d.d_reclen, uio); if (error == 0) { de = tmpfs_dir_next(node, &dc); if (cookies != NULL) { off = tmpfs_dirent_cookie(de); MPASS(*ncookies < maxcookies); cookies[(*ncookies)++] = off; } } } while (error == 0 && uio->uio_resid > 0 && de != NULL); /* Skip setting off when using cookies as it is already done above. */ if (cookies == NULL) off = tmpfs_dirent_cookie(de); /* Update the offset and cache. */ uio->uio_offset = off; node->tn_dir.tn_readdir_lastn = off; node->tn_dir.tn_readdir_lastp = de; tmpfs_set_status(node, TMPFS_NODE_ACCESSED); return error; } int tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; int error; error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) return (error); tmpfs_dir_attach(dvp, de); return (0); } void tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); MPASS(de != NULL && de->td_node == NULL); tmpfs_dir_detach(dvp, de); tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); } /* * Resizes the aobj associated with the regular file pointed to by 'vp' to the * size 'newsize'. 'vp' must point to a vnode that represents a regular file. * 'newsize' must be positive. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) { struct tmpfs_mount *tmp; struct tmpfs_node *node; vm_object_t uobj; vm_page_t m; vm_pindex_t idx, newpages, oldpages; off_t oldsize; int base, rv; MPASS(vp->v_type == VREG); MPASS(newsize >= 0); node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_reg.tn_aobj; tmp = VFS_TO_TMPFS(vp->v_mount); /* * Convert the old and new sizes to the number of pages needed to * store them. It may happen that we do not need to do anything * because the last allocated page can accommodate the change on * its own. */ oldsize = node->tn_size; oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); MPASS(oldpages == uobj->size); newpages = OFF_TO_IDX(newsize + PAGE_MASK); if (newpages > oldpages && tmpfs_pages_check_avail(tmp, newpages - oldpages) == 0) return (ENOSPC); VM_OBJECT_WLOCK(uobj); if (newsize < oldsize) { /* * Zero the truncated part of the last page. */ base = newsize & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(newsize); retry: m = vm_page_lookup(uobj, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "tmfssz")) goto retry; MPASS(m->valid == VM_PAGE_BITS_ALL); } else if (vm_pager_has_page(uobj, idx, NULL, NULL)) { m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL); if (m == NULL) { VM_OBJECT_WUNLOCK(uobj); VM_WAIT; VM_OBJECT_WLOCK(uobj); goto retry; - } else if (m->valid != VM_PAGE_BITS_ALL) - rv = vm_pager_get_pages(uobj, &m, 1, - NULL, NULL); - else - /* A cached page was reactivated. */ - rv = VM_PAGER_OK; + } + rv = vm_pager_get_pages(uobj, &m, 1, NULL, + NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); if (ignerr) m = NULL; else { VM_OBJECT_WUNLOCK(uobj); return (EIO); } } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); vm_page_dirty(m); vm_pager_page_unswapped(m); } } /* * Release any swap space and free any whole pages. */ if (newpages < oldpages) { swap_pager_freespace(uobj, newpages, oldpages - newpages); vm_object_page_remove(uobj, newpages, 0, 0); } } uobj->size = newpages; VM_OBJECT_WUNLOCK(uobj); atomic_add_long(&tmp->tm_pages_used, newpages - oldpages); node->tn_size = newsize; return (0); } void tmpfs_check_mtime(struct vnode *vp) { struct tmpfs_node *node; struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "check_mtime"); if (vp->v_type != VREG) return; obj = vp->v_object; KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) == (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj")); /* unlocked read */ if ((obj->flags & OBJ_TMPFS_DIRTY) != 0) { VM_OBJECT_WLOCK(obj); if ((obj->flags & OBJ_TMPFS_DIRTY) != 0) { obj->flags &= ~OBJ_TMPFS_DIRTY; node = VP_TO_TMPFS_NODE(vp); node->tn_status |= TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED; } VM_OBJECT_WUNLOCK(obj); } } /* * Change flags of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chflags"); node = VP_TO_TMPFS_NODE(vp); if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((flags ^ node->tn_flags) & SF_SETTABLE)) return (EPERM); } node->tn_flags = flags; node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chflags2"); return (0); } /* * Change access mode on the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chmod"); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0)) return (EFTYPE); } if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID, 0); if (error) return (error); } node->tn_mode &= ~ALLPERMS; node->tn_mode |= mode & ALLPERMS; node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chmod2"); return (0); } /* * Change ownership of the given vnode. At least one of uid or gid must * be different than VNOVAL. If one is set to that value, the attribute * is unchanged. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; uid_t ouid; gid_t ogid; ASSERT_VOP_ELOCKED(vp, "chown"); node = VP_TO_TMPFS_NODE(vp); /* Assign default values if they are unknown. */ MPASS(uid != VNOVAL || gid != VNOVAL); if (uid == VNOVAL) uid = node->tn_uid; if (gid == VNOVAL) gid = node->tn_gid; MPASS(uid != VNOVAL && gid != VNOVAL); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if ((uid != node->tn_uid || (gid != node->tn_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0))) return (error); ogid = node->tn_gid; ouid = node->tn_uid; node->tn_uid = uid; node->tn_gid = gid; node->tn_status |= TMPFS_NODE_CHANGED; if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) node->tn_mode &= ~(S_ISUID | S_ISGID); } ASSERT_VOP_ELOCKED(vp, "chown2"); return (0); } /* * Change size of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chsize"); node = VP_TO_TMPFS_NODE(vp); /* Decide whether this is a valid operation based on the file type. */ error = 0; switch (vp->v_type) { case VDIR: return EISDIR; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VFIFO: /* Allow modifications of special files even if in the file * system is mounted read-only (we are not modifying the * files themselves, but the objects they represent). */ return 0; default: /* Anything else is unsupported. */ return EOPNOTSUPP; } /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = tmpfs_truncate(vp, size); /* tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents * for us, as will update tn_status; no need to do that here. */ ASSERT_VOP_ELOCKED(vp, "chsize2"); return (error); } /* * Change access and modification times of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chtimes(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *l) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chtimes"); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = vn_utimes_perm(vp, vap, cred, l); if (error != 0) return (error); if (vap->va_atime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_ACCESSED; if (vap->va_mtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_birthtime = vap->va_birthtime; ASSERT_VOP_ELOCKED(vp, "chtimes2"); return (0); } void tmpfs_set_status(struct tmpfs_node *node, int status) { if ((node->tn_status & status) == status) return; TMPFS_NODE_LOCK(node); node->tn_status |= status; TMPFS_NODE_UNLOCK(node); } /* Sync timestamps */ void tmpfs_itimes(struct vnode *vp, const struct timespec *acc, const struct timespec *mod) { struct tmpfs_node *node; struct timespec now; ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); node = VP_TO_TMPFS_NODE(vp); if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) return; vfs_timestamp(&now); TMPFS_NODE_LOCK(node); if (node->tn_status & TMPFS_NODE_ACCESSED) { if (acc == NULL) acc = &now; node->tn_atime = *acc; } if (node->tn_status & TMPFS_NODE_MODIFIED) { if (mod == NULL) mod = &now; node->tn_mtime = *mod; } if (node->tn_status & TMPFS_NODE_CHANGED) node->tn_ctime = now; node->tn_status &= ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); TMPFS_NODE_UNLOCK(node); /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(node, sizeof(*node), 1, RANDOM_FS_ATIME); } void tmpfs_update(struct vnode *vp) { tmpfs_itimes(vp, NULL, NULL); } int tmpfs_truncate(struct vnode *vp, off_t length) { int error; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); if (length < 0) { error = EINVAL; goto out; } if (node->tn_size == length) { error = 0; goto out; } if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); error = tmpfs_reg_resize(vp, length, FALSE); if (error == 0) node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; out: tmpfs_update(vp); return (error); } static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) { if (a->td_hash > b->td_hash) return (1); else if (a->td_hash < b->td_hash) return (-1); return (0); } RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); Index: stable/11/sys/i386/i386/pmap.c =================================================================== --- stable/11/sys/i386/i386/pmap.c (revision 318715) +++ stable/11/sys/i386/i386/pmap.c (revision 318716) @@ -1,5723 +1,5705 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; extern u_int32_t KERNend; extern u_int32_t KPTphys; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; struct msgbuf *msgbufp = NULL; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2; static pt_entry_t *PADDR1 = NULL, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); -static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); -static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); +static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait); #endif static void pmap_set_pg(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). Otherwise, * pmap_update_pde_kernel() could access allpmaps while it is * being changed. */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; /* * Adjust the start of the KPTD and KPTmap so that the implementation * of pmap_kextract() and pmap_growkernel() can be made simpler. */ KPTD -= KPTDI; KPTmap -= i386_btop(KPTDI << PDRSHIFT); /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ static void pmap_set_pg(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag == 0) return; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir_pde(PTD, va) |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pmap_t pmap; boolean_t PTD_updated; PTD_updated = FALSE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) PTD_updated = TRUE; pde = pmap_pde(pmap, va); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); KASSERT(PTD_updated, ("pmap_kenter_pde: current page table is not in allpmaps")); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } } void invltlb_glob(void) { uint64_t cr4; if (pgeflag == 0) { invltlb(); } else { cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); load_cr4(cr4 | CR4_PGE); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb_glob(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { /* * Elsewhere, this operation requires allpmaps_lock for * synchronization. Here, it does not because it is being * performed in the context of an all_cpus rendezvous. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, act->va); pde_store(pde, act->newpde); } } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendevous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)cpu_clflush_line_size; } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || pmap == vmspace_pmap(curthread->td_proc->p_vmspace)); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | pgeflag | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pgeflag | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* - * Looks for a page table page mapping the specified virtual address in the - * specified pmap's collection of idle page table pages. Returns NULL if there - * is no page table page corresponding to the specified virtual address. + * Removes the page table page mapping the specified virtual address from the + * specified pmap's collection of idle page table pages, and returns it. + * Otherwise, returns NULL if there is no page table page corresponding to the + * specified virtual address. */ static __inline vm_page_t -pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) +pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); - return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); + return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* - * Removes the specified page table page from the specified pmap's collection - * of idle page table pages. The specified page table page must be a member of - * the pmap's collection. - */ -static __inline void -pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) -{ - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - vm_radix_remove(&pmap->pm_root, mpte->pindex); -} - -/* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); /* * Since the page table directory is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) if ((ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Copy the kernel page table directory entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); mtx_unlock_spin(&allpmaps_lock); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt[i] = pa | PG_V; #endif } CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); - if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != - NULL) - pmap_remove_pt_page(pmap, mpte); - else { + if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == + NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); pmap_invalidate_range(pmap, sva, sva + NBPDR - 1); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (va >= KERNBASE) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - mpte = pmap_lookup_pt_page(pmap, va); + mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); - pmap_remove_pt_page(pmap, mpte); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. * * When workaround_erratum383 is false, a promotion to a 2M/4M * page mapping does not invalidate the 512/1024 4K page mappings * from the TLB. Consequently, at this point, the TLB may * hold both 4K and 2M/4M page mappings. Therefore, the entire * range of addresses must be invalidated here. In contrast, * when workaround_erratum383 is true, a promotion does * invalidate the 512/1024 4K page mappings, and so a single INVLPG * suffices to invalidate the 2M/4M page mapping. */ if ((oldpde & PG_G) != 0) { if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { - mpte = pmap_lookup_pt_page(pmap, sva); + mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { if (!pde_cmpset(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) { /* See pmap_remove_pde() for explanation. */ if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PS | newpde); else pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); pde = pmap_pde(pmap, va); if (va < VM_MAXUSER_ADDRESS) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; u_int ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page_idle: CMAP3 busy"); sched_pin(); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(CADDR3); pagezero(CADDR3); *CMAP3 = 0; sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } - mpte = pmap_lookup_pt_page(pmap, pv->pv_va); + mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { - pmap_remove_pt_page(pmap, mpte); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif Index: stable/11/sys/kern/kern_exec.c =================================================================== --- stable/11/sys/kern/kern_exec.c (revision 318715) +++ stable/11/sys/kern/kern_exec.c (revision 318716) @@ -1,1633 +1,1633 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp, 0); if (vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (ma[0]->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(ma[0]); if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, - VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); + VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } vm_page_xunbusy(ma[0]); for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error) { vm_object_deallocate(obj); return (error); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. */ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. Returns zero if the allocation succeeds * and ENOMEM otherwise. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX); return (args->buf != NULL ? 0 : ENOMEM); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { kmap_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: stable/11/sys/kern/uipc_shm.c =================================================================== --- stable/11/sys/kern/uipc_shm.c (revision 318715) +++ stable/11/sys/kern/uipc_shm.c (revision 318716) @@ -1,1107 +1,1104 @@ /*- * Copyright (c) 2006, 2011 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * TODO: * * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) * and ipcrm(1) be expanded or should new tools to manage both POSIX * kernel semaphores and POSIX shared memory be written? * * (2) Add support for this file type to fstat(1). * * (3) Resource limits? Does this need its own resource limits or are the * existing limits in mmap(2) sufficient? */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr *shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; /* File descriptor operations. */ struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (m->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(m); if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); vm_page_xunbusy(m); } vm_page_lock(m); vm_page_hold(m); if (m->queue == PQ_NONE) { vm_page_deactivate(m); } else { /* Requeue to maintain LRU ordering. */ vm_page_requeue(m); } vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = howmany(sb->st_size, sb->st_blksize); mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page. */ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL); if (m == NULL) { VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; - } else if (m->valid != VM_PAGE_BITS_ALL) - rv = vm_pager_get_pages(object, &m, 1, - NULL, NULL); - else - /* A cached page was reactivated. */ - rv = VM_PAGER_OK; + } + rv = vm_pager_get_pages(object, &m, 1, NULL, + NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = ptoa(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Attempt to reserve the swap */ delta = ptoa(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; int ino; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; ino = alloc_unr(shm_ino_unr); if (ino == -1) shmfd->shm_ino = 0; else shmfd->shm_ino = ino; refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); if (shmfd->shm_ino != 0) free_unr(shm_ino_unr, shmfd->shm_ino); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. */ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized")); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL)); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; shmfd = fp->f_data; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; /* Don't permit shared writable mappings on read-only descriptors. */ if ((flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); maxprot &= cap_maxprot; /* See comment in vn_mmap(). */ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - objsize) return (EINVAL); #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) return (error); #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, FALSE, td); if (error != 0) vm_object_deallocate(shmfd->shm_object); return (0); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. */ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct shmfd *shmfd; size_t pr_pathlen; kif->kf_type = KF_TYPE_SHM; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; /* XXX */ mtx_unlock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { sx_slock(&shm_dict_lock); if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&shm_dict_lock); } return (0); } Index: stable/11/sys/sys/vmmeter.h =================================================================== --- stable/11/sys/sys/vmmeter.h (revision 318715) +++ stable/11/sys/sys/vmmeter.h (revision 318716) @@ -1,207 +1,227 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vmmeter.h 8.2 (Berkeley) 7/10/94 * $FreeBSD$ */ #ifndef _SYS_VMMETER_H_ #define _SYS_VMMETER_H_ /* * This value is used by ps(1) to change sleep state flag from 'S' to * 'I' and by the sched process to set the alarm clock. */ #define MAXSLP 20 /* * System wide statistics counters. * Locking: * a - locked by atomic operations * c - constant after initialization * f - locked by vm_page_queue_free_mtx * p - locked by being in the PCPU and atomicity respect to interrupts * q - changes are synchronized by the corresponding vm_pagequeue lock */ struct vmmeter { /* * General system activity. */ u_int v_swtch; /* (p) context switches */ u_int v_trap; /* (p) calls to trap */ u_int v_syscall; /* (p) calls to syscall() */ u_int v_intr; /* (p) device interrupts */ u_int v_soft; /* (p) software interrupts */ /* * Virtual memory activity. */ u_int v_vm_faults; /* (p) address memory faults */ u_int v_io_faults; /* (p) page faults requiring I/O */ u_int v_cow_faults; /* (p) copy-on-writes faults */ u_int v_cow_optim; /* (p) optimized copy-on-writes faults */ u_int v_zfod; /* (p) pages zero filled on demand */ u_int v_ozfod; /* (p) optimized zero fill pages */ u_int v_swapin; /* (p) swap pager pageins */ u_int v_swapout; /* (p) swap pager pageouts */ u_int v_swappgsin; /* (p) swap pager pages paged in */ u_int v_swappgsout; /* (p) swap pager pages paged out */ u_int v_vnodein; /* (p) vnode pager pageins */ u_int v_vnodeout; /* (p) vnode pager pageouts */ u_int v_vnodepgsin; /* (p) vnode_pager pages paged in */ u_int v_vnodepgsout; /* (p) vnode pager pages paged out */ u_int v_intrans; /* (p) intransit blocking page faults */ - u_int v_reactivated; /* (f) pages reactivated from free list */ + u_int v_reactivated; /* (p) pages reactivated by the pagedaemon */ u_int v_pdwakeups; /* (p) times daemon has awaken from sleep */ u_int v_pdpages; /* (p) pages analyzed by daemon */ + u_int v_pdshortfalls; /* (p) page reclamation shortfalls */ u_int v_tcached; /* (p) total pages cached */ u_int v_dfree; /* (p) pages freed by daemon */ u_int v_pfree; /* (p) pages freed by exiting processes */ u_int v_tfree; /* (p) total pages freed */ /* * Distribution of page usages. */ u_int v_page_size; /* (c) page size in bytes */ u_int v_page_count; /* (c) total number of pages in system */ u_int v_free_reserved; /* (c) pages reserved for deadlock */ u_int v_free_target; /* (c) pages desired free */ u_int v_free_min; /* (c) pages desired free */ u_int v_free_count; /* (f) pages free */ u_int v_wire_count; /* (a) pages wired down */ u_int v_active_count; /* (q) pages active */ u_int v_inactive_target; /* (c) pages desired inactive */ u_int v_inactive_count; /* (q) pages inactive */ + u_int v_laundry_count; /* (q) pages eligible for laundering */ u_int v_cache_count; /* (f) pages on cache queue */ u_int v_pageout_free_min; /* (c) min pages reserved for kernel */ u_int v_interrupt_free_min; /* (c) reserved pages for int code */ u_int v_free_severe; /* (c) severe page depletion point */ /* * Fork/vfork/rfork activity. */ u_int v_forks; /* (p) fork() calls */ u_int v_vforks; /* (p) vfork() calls */ u_int v_rforks; /* (p) rfork() calls */ u_int v_kthreads; /* (p) fork() calls by kernel */ u_int v_forkpages; /* (p) VM pages affected by fork() */ u_int v_vforkpages; /* (p) VM pages affected by vfork() */ u_int v_rforkpages; /* (p) VM pages affected by rfork() */ u_int v_kthreadpages; /* (p) VM pages affected by fork() by kernel */ - u_int v_spare[2]; }; #ifdef _KERNEL extern struct vmmeter vm_cnt; extern int vm_pageout_wakeup_thresh; /* * Return TRUE if we are under our severe low-free-pages threshold * * This routine is typically used at the user<->system interface to determine * whether we need to block in order to avoid a low memory deadlock. */ static inline int vm_page_count_severe(void) { return (vm_cnt.v_free_severe > vm_cnt.v_free_count + vm_cnt.v_cache_count); } /* * Return TRUE if we are under our minimum low-free-pages threshold. * * This routine is typically used within the system to determine whether * we can execute potentially very expensive code in terms of memory. It * is also used by the pageout daemon to calculate when to sleep, when * to wake waiters up, and when (after making a pass) to become more * desperate. */ static inline int vm_page_count_min(void) { return (vm_cnt.v_free_min > vm_cnt.v_free_count + vm_cnt.v_cache_count); } /* * Return TRUE if we have not reached our free page target during * free page recovery operations. */ static inline int vm_page_count_target(void) { return (vm_cnt.v_free_target > vm_cnt.v_free_count + vm_cnt.v_cache_count); } /* * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. */ static inline int vm_paging_target(void) { return (vm_cnt.v_free_target - (vm_cnt.v_free_count + vm_cnt.v_cache_count)); } /* * Returns TRUE if the pagedaemon needs to be woken up. */ static inline int vm_paging_needed(void) { return (vm_cnt.v_free_count + vm_cnt.v_cache_count < (u_int)vm_pageout_wakeup_thresh); } + +/* + * Return the number of pages we need to launder. + * A positive number indicates that we have a shortfall of clean pages. + */ +static inline int +vm_laundry_target(void) +{ + + return (vm_paging_target()); +} + +/* + * Obtain the value of a per-CPU counter. + */ +#define VM_METER_PCPU_CNT(member) \ + vm_meter_cnt(__offsetof(struct vmmeter, member)) + +u_int vm_meter_cnt(size_t); #endif /* systemwide totals computed every five seconds */ struct vmtotal { int16_t t_rq; /* length of the run queue */ int16_t t_dw; /* jobs in ``disk wait'' (neg priority) */ int16_t t_pw; /* jobs in page wait */ int16_t t_sl; /* jobs sleeping in core */ int16_t t_sw; /* swapped out runnable/short block jobs */ int32_t t_vm; /* total virtual memory */ int32_t t_avm; /* active virtual memory */ int32_t t_rm; /* total real memory in use */ int32_t t_arm; /* active real memory */ int32_t t_vmshr; /* shared virtual memory */ int32_t t_avmshr; /* active shared virtual memory */ int32_t t_rmshr; /* shared real memory */ int32_t t_armshr; /* active shared real memory */ int32_t t_free; /* free memory pages */ }; #endif Index: stable/11/sys/vm/_vm_radix.h =================================================================== --- stable/11/sys/vm/_vm_radix.h (revision 318715) +++ stable/11/sys/vm/_vm_radix.h (revision 318716) @@ -1,55 +1,51 @@ /* * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __VM_RADIX_H_ #define __VM_RADIX_H_ /* * Radix tree root. */ struct vm_radix { uintptr_t rt_root; - uint8_t rt_flags; }; - -#define RT_INSERT_INPROG 0x01 -#define RT_TRIE_MODIFIED 0x02 #ifdef _KERNEL static __inline boolean_t vm_radix_is_empty(struct vm_radix *rtree) { return (rtree->rt_root == 0); } #endif /* _KERNEL */ #endif /* !__VM_RADIX_H_ */ Index: stable/11/sys/vm/swap_pager.c =================================================================== --- stable/11/sys/vm/swap_pager.c (revision 318715) +++ stable/11/sys/vm/swap_pager.c (revision 318716) @@ -1,2815 +1,2818 @@ /*- * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_swap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, 16 * or 32 pages per allocation. * The 32-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif /* * The swblock structure maps an object and a small, fixed-size range * of page indices to disk addresses within a swap area. * The collection of these mappings is implemented as a hash table. * Unused disk addresses within a swap area are allocated and managed * using a blist. */ #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) #define SWAP_META_PAGES (SWB_NPAGES * 2) #define SWAP_META_MASK (SWAP_META_PAGES - 1) struct swblock { struct swblock *swb_hnext; vm_object_t swb_object; vm_pindex_t swb_index; int swb_count; daddr_t swb_pages[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static vm_ooffset_t swap_total; SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, "Total amount of available swap storage."); static vm_ooffset_t swap_reserved; SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, "Amount of swap storage needed to back all allocated anonymous memory."); static int overcommit = 0; SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) int swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } int swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { vm_ooffset_t r, s; int res, error; static int curfail; static struct timeval lastfail; struct uidinfo *uip; uip = cred->cr_ruidinfo; if (incr & PAGE_MASK) panic("swap_reserve: & PAGE_MASK"); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); error = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); if (error != 0) return (0); } #endif res = 0; mtx_lock(&sw_dev_mtx); r = swap_reserved + incr; if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) { s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_cnt.v_wire_count; s *= PAGE_SIZE; } else s = 0; s += swap_total; if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s || (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) { res = 1; swap_reserved = r; } mtx_unlock(&sw_dev_mtx); if (res) { UIDINFO_VMSIZE_LOCK(uip); if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 && uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) res = 0; else uip->ui_vmsize += incr; UIDINFO_VMSIZE_UNLOCK(uip); if (!res) { mtx_lock(&sw_dev_mtx); swap_reserved -= incr; mtx_unlock(&sw_dev_mtx); } } if (!res && ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", uip->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (!res) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (res); } void swap_reserve_force(vm_ooffset_t incr) { struct uidinfo *uip; mtx_lock(&sw_dev_mtx); swap_reserved += incr; mtx_unlock(&sw_dev_mtx); #ifdef RACCT PROC_LOCK(curproc); racct_add_force(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); #endif uip = curthread->td_ucred->cr_ruidinfo; PROC_LOCK(curproc); UIDINFO_VMSIZE_LOCK(uip); uip->ui_vmsize += incr; UIDINFO_VMSIZE_UNLOCK(uip); PROC_UNLOCK(curproc); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curthread->td_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { struct uidinfo *uip; uip = cred->cr_ruidinfo; if (decr & PAGE_MASK) panic("swap_release: & PAGE_MASK"); mtx_lock(&sw_dev_mtx); if (swap_reserved < decr) panic("swap_reserved < decr"); swap_reserved -= decr; mtx_unlock(&sw_dev_mtx); UIDINFO_VMSIZE_LOCK(uip); if (uip->ui_vmsize < decr) printf("negative vmsize for uid = %d\n", uip->ui_uid); uip->ui_vmsize -= decr; UIDINFO_VMSIZE_UNLOCK(uip); racct_sub_cred(cred, RACCT_SWAP, decr); } #define SWM_FREE 0x02 /* free, period */ #define SWM_POP 0x04 /* pop out */ int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static int nsw_rcount; /* free read buffers */ static int nsw_wcount_sync; /* limit write buffers / synchronous */ static int nsw_wcount_async; /* limit write buffers / asynchronous */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); static struct swblock **swhash; static int swhash_mask; static struct mtx swhash_mtx; static struct sx sw_alloc_sx; /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swap_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ }; /* * dmmax is in page-sized chunks with the new swap system. It was * dev-bsized chunks in the old. dmmax is always a power of 2. * * swap_*() routines are externally accessible. swp_*() routines are * internal. */ static int dmmax; static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, int npages); static daddr_t swp_pager_getswapspace(int npages); /* * Metadata functions */ static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWP_PAGER_HASH() - hash swap meta data * * This is an helper function which hashes the swapblk given * the object and page index. It returns a pointer to a pointer * to the object, or a pointer to a NULL pointer if it could not * find a swapblk. */ static struct swblock ** swp_pager_hash(vm_object_t object, vm_pindex_t index) { struct swblock **pswap; struct swblock *swap; index &= ~(vm_pindex_t)SWAP_META_MASK; pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; while ((swap = *pswap) != NULL) { if (swap->swb_object == object && swap->swb_index == index ) { break; } pswap = &swap->swb_hnext; } return (pswap); } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); sx_init(&swdev_syscall_lock, "swsysc"); /* * Device Stripe, in PAGE_SIZE'd blocks */ dmmax = SWB_NPAGES * 2; } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array (MAXPHYS/PAGE_SIZE) and our locally defined * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); mtx_lock(&pbuf_mtx); nsw_rcount = (nswbuf + 1) / 2; nsw_wcount_sync = (nswbuf + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_unlock(&pbuf_mtx); /* * Initialize our zone. Right now I'm just guessing on the number * we need based on the number of pages in the system. Each swblock * can hold 32 pages, so this is probably overkill. This reservation * is typically limited to around 32MB by default. */ n = vm_cnt.v_page_count / 2; if (maxswzone && n > maxswzone / sizeof(struct swblock)) n = maxswzone / sizeof(struct swblock); n2 = n; swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); if (swap_zone == NULL) panic("failed to create swap_zone."); do { if (uma_zone_reserve_kva(swap_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); if (n2 != n) printf("Swap zone entries reduced from %lu to %lu.\n", n2, n); swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblock); n2 = n; /* * Initialize our meta-data hash table. The swapper does not need to * be quite as efficient as the VM system, so we do not use an * oversized hash table. * * n: size of hash table, must be power of 2 * swhash_mask: hash table index mask */ for (n = 1; n < n2 / 8; n *= 2) ; swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); swhash_mask = n - 1; mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); } static vm_object_t swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset) { vm_object_t object; if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset + PAGE_MASK + size)); object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } object->un_pager.swp.swp_bcount = 0; return (object); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. * * This routine must ensure that no live duplicate is created for * the named object request, which is protected against by * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) { /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = swap_pager_alloc_init(handle, cred, size, offset); if (object != NULL) { TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object, pager_object_list); } } sx_xunlock(&sw_alloc_sx); } else { object = swap_pager_alloc_init(handle, cred, size, offset); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { VM_OBJECT_WUNLOCK(object); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(object); } vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for the requested number of pages. The starting * swap block number (a page index) is returned or SWAPBLK_NONE * if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. */ static daddr_t swp_pager_getswapspace(int npages) { daddr_t blk; struct swdevt *sp; int i; blk = SWAPBLK_NONE; mtx_lock(&sw_dev_mtx); sp = swdevhd; for (i = 0; i < nswapdev; i++) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if (!(sp->sw_flags & SW_CLOSING)) { blk = blist_alloc(sp->sw_blist, npages); if (blk != SWAPBLK_NONE) { blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); goto done; } } sp = TAILQ_NEXT(sp, sw_list); } if (swap_pager_full != 2) { printf("swap_pager_getswapspace(%d): failed\n", npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; done: mtx_unlock(&sw_dev_mtx); return (blk); } static int swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, int npages) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (blk >= sp->sw_first && blk < sp->sw_end) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { int n = 0; daddr_t blk = SWAPBLK_NONE; vm_pindex_t beg = start; /* save start index */ VM_OBJECT_WLOCK(object); while (size) { if (n == 0) { n = BLIST_MAX_ALLOC; while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { n >>= 1; if (n == 0) { swp_pager_meta_free(object, beg, start - beg); VM_OBJECT_WUNLOCK(object); return (-1); } } } swp_pager_meta_build(object, start, blk); --size; ++start; ++blk; --n; } swp_pager_meta_free(object, start, n); VM_OBJECT_WUNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource && srcobject->handle != NULL) { vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); VM_OBJECT_WUNLOCK(dstobject); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(dstobject); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } /* * transfer source to destination. */ for (i = 0; i < dstobject->size; ++i) { daddr_t dstaddr; /* * Locate (without changing) the swapblk on the destination, * unless it is invalid in which case free it silently, or * if the destination is a resident page, in which case the * source is thrown away. */ dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr == SWAPBLK_NONE) { /* * Destination has no swapblk and is not resident, * copy source. */ daddr_t srcaddr; srcaddr = swp_pager_meta_ctl( srcobject, i + offset, SWM_POP ); if (srcaddr != SWAPBLK_NONE) { /* * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_WUNLOCK(srcobject); vm_object_pip_add(dstobject, 1); swp_pager_meta_build(dstobject, i, srcaddr); vm_object_pip_wakeup(dstobject); VM_OBJECT_WLOCK(srcobject); vm_object_pip_wakeup(srcobject); } } else { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the sourceblock. */ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. */ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk, blk0; int i; VM_OBJECT_ASSERT_LOCKED(object); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = i - 1; } /* * find forward-looking contiguous good backing store */ if (after != NULL) { for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = i - 1; } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page must be locked. */ static void swap_pager_unswapped(vm_page_t m) { swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* * swap_pager_getpages() - bring pages in from swap * * Attempt to page in the pages in array "m" of length "count". The caller * may optionally specify that additional pages preceding and succeeding * the specified range be paged in. The number of such pages is returned * in the "rbehind" and "rahead" parameters, and they will be in the * inactive queue upon return. * * The pages in "m" must be busied and will remain busied upon return. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct buf *bp; vm_page_t mpred, msucc, p; vm_pindex_t pindex; daddr_t blk; int i, j, maxahead, maxbehind, reqcount, shift; reqcount = count; VM_OBJECT_WUNLOCK(object); bp = getpbuf(&nsw_rcount); VM_OBJECT_WLOCK(object); if (!swap_pager_haspage(object, m[0]->pindex, &maxbehind, &maxahead)) { relpbuf(bp, &nsw_rcount); return (VM_PAGER_FAIL); } /* * Clip the readahead and readbehind ranges to exclude resident pages. */ if (rahead != NULL) { KASSERT(reqcount - 1 <= maxahead, ("page count %d extends beyond swap block", reqcount)); *rahead = imin(*rahead, maxahead - (reqcount - 1)); pindex = m[reqcount - 1]->pindex; msucc = TAILQ_NEXT(m[reqcount - 1], listq); if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) *rahead = msucc->pindex - pindex - 1; } if (rbehind != NULL) { *rbehind = imin(*rbehind, maxbehind); pindex = m[0]->pindex; mpred = TAILQ_PREV(m[0], pglist, listq); if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) *rbehind = pindex - mpred->pindex - 1; } /* * Allocate readahead and readbehind pages. */ shift = rbehind != NULL ? *rbehind : 0; if (shift != 0) { for (i = 1; i <= shift; i++) { p = vm_page_alloc(object, m[0]->pindex - i, - VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); + VM_ALLOC_NORMAL); if (p == NULL) { /* Shift allocated pages to the left. */ for (j = 0; j < i - 1; j++) bp->b_pages[j] = bp->b_pages[j + shift - i + 1]; break; } bp->b_pages[shift - i] = p; } shift = i - 1; *rbehind = shift; } for (i = 0; i < reqcount; i++) bp->b_pages[i + shift] = m[i]; if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, - m[reqcount - 1]->pindex + i + 1, - VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); + m[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[shift + reqcount + i] = p; } *rahead = i; } if (rbehind != NULL) count += *rbehind; if (rahead != NULL) count += *rahead; vm_object_pip_add(object, count); for (i = 0; i < count; i++) bp->b_pages[i]->oflags |= VPO_SWAPINPROG; pindex = bp->b_pages[0]->pindex; blk = swp_pager_meta_ctl(object, pindex, 0); KASSERT(blk != SWAPBLK_NONE, ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; bp->b_npages = count; bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; bp->b_pgafter = rahead != NULL ? *rahead : 0; PCPU_INC(cnt.v_swapin); PCPU_ADD(cnt.v_swappgsin, count); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our m[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); while ((m[0]->oflags & VPO_SWAPINPROG) != 0) { m[0]->oflags |= VPO_SWAPSLEEP; PCPU_INC(cnt.v_intrans); if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * If we had an unrecoverable read error pages will not be valid. */ for (i = 0; i < reqcount; i++) if (m[i]->valid != VM_PAGE_BITS_ALL) return (VM_PAGER_ERROR); return (VM_PAGER_OK); /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } /* * swap_pager_getpages_async(): * * Right now this is emulation of asynchronous operation on top of * swap_pager_getpages(). */ static int swap_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { int r, error; r = swap_pager_getpages(object, m, count, rbehind, rahead); VM_OBJECT_WUNLOCK(object); switch (r) { case VM_PAGER_OK: error = 0; break; case VM_PAGER_ERROR: error = EIO; break; case VM_PAGER_FAIL: error = EINVAL; break; default: panic("unhandled swap_pager_getpages() error %d", r); } (iodone)(arg, m, count, error); VM_OBJECT_WLOCK(object); return (r); } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ static void swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int i, n; boolean_t sync; if (count && m[0]->object != object) { panic("swap_pager_putpages: object mismatch %p/%p", object, m[0]->object ); } /* * Step 1 * * Turn object into OBJT_SWAP * check for bogus sysops * force sync if not pageout process */ if (object->type != OBJT_SWAP) swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_WUNLOCK(object); n = 0; if (curproc != pageproc) sync = TRUE; else sync = (flags & VM_PAGER_PUT_SYNC) != 0; /* * Step 2 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { int j; struct buf *bp; daddr_t blk; /* * Maximum I/O size is limited by a number of factors. */ n = min(BLIST_MAX_ALLOC, count - i); n = min(n, nsw_cluster_max); /* * Get biggest block of swap we can. If we fail, fall * back and try to allocate a smaller block. Don't go * overboard trying to allocate space if it would overly * fragment swap. */ while ( (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && n > 4 ) { n >>= 1; } if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied, build the I/O * request and assign the swap space. */ if (sync == TRUE) { bp = getpbuf(&nsw_wcount_sync); } else { bp = getpbuf(&nsw_wcount_async); bp->b_flags = B_ASYNC; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_WLOCK(object); for (j = 0; j < n; ++j) { vm_page_t mreq = m[i+j]; swp_pager_meta_build( mreq->object, mreq->pindex, blk + j ); vm_page_dirty(mreq); mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_WUNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; PCPU_INC(cnt.v_swapout); PCPU_ADD(cnt.v_swappgsout, bp->b_npages); /* * We unconditionally set rtvals[] to VM_PAGER_PEND so that we * can call the async completion routine at the end of a * synchronous I/O operation. Otherwise, our caller would * perform duplicate unbusy and wakeup operations on the page * and object, respectively. */ for (j = 0; j < n; j++) rtvals[i + j] = VM_PAGER_PEND; /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ if (sync == FALSE) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * report error */ if (bp->b_ioflags & BIO_ERROR) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); else bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->paging_in_progress); } if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ m->valid = 0; } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ vm_page_dirty(m); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); m->valid = VM_PAGE_BITS_ALL; if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). + * A page is only written to swap after a period of + * inactivity. Therefore, we do not expect it to be + * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); + vm_page_lock(m); + vm_page_deactivate_noreuse(m); + vm_page_unlock(m); vm_page_sunbusy(m); - if (vm_page_count_severe()) { - vm_page_lock(m); - vm_page_try_to_cache(m); - vm_page_unlock(m); - } } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ relpbuf( bp, ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : ((bp->b_flags & B_ASYNC) ? &nsw_wcount_async : &nsw_wcount_sync ) ) ); } /* * swap_pager_isswapped: * * Return 1 if at least one page in the given object is paged * out to the given swap device. * * This routine may not sleep. */ int swap_pager_isswapped(vm_object_t object, struct swdevt *sp) { daddr_t index = 0; int bcount; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return (0); mtx_lock(&swhash_mtx); for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { struct swblock *swap; if ((swap = *swp_pager_hash(object, index)) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { if (swp_pager_isondev(swap->swb_pages[i], sp)) { mtx_unlock(&swhash_mtx); return (1); } } } index += SWAP_META_PAGES; } mtx_unlock(&swhash_mtx); return (0); } /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * - * This routine dissociates the page at the given index within a - * swap block from its backing store, paging it in if necessary. - * If the page is paged in, it is placed in the inactive queue, - * since it had its backing store ripped out from under it. - * We also attempt to swap in all other pages in the swap block, - * we only guarantee that the one at the specified index is + * This routine dissociates the page at the given index within an object + * from its backing store, paging it in if it does not reside in memory. + * If the page is paged in, it is marked dirty and placed in the laundry + * queue. The page is marked dirty because it no longer has backing + * store. It is placed in the laundry queue because it has not been + * accessed recently. Otherwise, it would already reside in memory. + * + * We also attempt to swap in all other pages in the swap block. + * However, we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we * revert to the one-by-one behavior for now. Sigh. */ static inline void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; vm_object_pip_add(object, 1); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid == VM_PAGE_BITS_ALL) { vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); vm_page_activate(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); return; } if (swap_pager_getpages(object, &m, 1, NULL, NULL) != VM_PAGER_OK) panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ vm_object_pip_wakeup(object); vm_page_dirty(m); vm_page_lock(m); - vm_page_deactivate(m); + vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); vm_pager_page_unswapped(m); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { struct swblock *swap; vm_object_t locked_obj, object; vm_pindex_t pindex; int i, j, retries; sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; locked_obj = NULL; full_rescan: mtx_lock(&swhash_mtx); for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ restart: for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { object = swap->swb_object; pindex = swap->swb_index; for (j = 0; j < SWAP_META_PAGES; ++j) { if (!swp_pager_isondev(swap->swb_pages[j], sp)) continue; if (locked_obj != object) { if (locked_obj != NULL) VM_OBJECT_WUNLOCK(locked_obj); locked_obj = object; if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(&swhash_mtx); /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); mtx_lock(&swhash_mtx); goto restart; } } MPASS(locked_obj == object); mtx_unlock(&swhash_mtx); swp_pager_force_pagein(object, pindex + j); mtx_lock(&swhash_mtx); goto restart; } } } mtx_unlock(&swhash_mtx); if (locked_obj != NULL) { VM_OBJECT_WUNLOCK(locked_obj); locked_obj = NULL; } if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is freed. */ static void swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int exhausted; struct swblock *swap; struct swblock **pswap; int idx; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { object->type = OBJT_SWAP; object->un_pager.swp.swp_bcount = 0; KASSERT(object->handle == NULL, ("default pager with handle")); } /* * Locate hash entry. If not found create, but if we aren't adding * anything just return. If we run out of space in the map we wait * and, since the hash table may have changed, retry. */ retry: mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) == NULL) { int i; if (swapblk == SWAPBLK_NONE) goto done; swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (swap == NULL) { mtx_unlock(&swhash_mtx); VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swap_zone)) { if (atomic_cmpset_int(&exhausted, 0, 1)) printf("swap zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonex", 10); } else VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; } if (atomic_cmpset_int(&exhausted, 1, 0)) printf("swap zone ok\n"); swap->swb_hnext = NULL; swap->swb_object = object; swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; swap->swb_count = 0; ++object->un_pager.swp.swp_bcount; for (i = 0; i < SWAP_META_PAGES; ++i) swap->swb_pages[i] = SWAPBLK_NONE; } /* * Delete prior contents of metadata */ idx = pindex & SWAP_META_MASK; if (swap->swb_pages[idx] != SWAPBLK_NONE) { swp_pager_freeswapspace(swap->swb_pages[idx], 1); --swap->swb_count; } /* * Enter block into metadata */ swap->swb_pages[idx] = swapblk; if (swapblk != SWAPBLK_NONE) ++swap->swb_count; done: mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count) { struct swblock **pswap, *swap; vm_pindex_t c; daddr_t v; int n, sidx; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP || count == 0) return; mtx_lock(&swhash_mtx); for (c = 0; c < count;) { pswap = swp_pager_hash(object, index); sidx = index & SWAP_META_MASK; n = SWAP_META_PAGES - sidx; index += n; if ((swap = *pswap) == NULL) { c += n; continue; } for (; c < count && sidx < SWAP_META_PAGES; ++c, ++sidx) { if ((v = swap->swb_pages[sidx]) == SWAPBLK_NONE) continue; swp_pager_freeswapspace(v, 1); swap->swb_pages[sidx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; c += SWAP_META_PAGES - sidx; break; } } } mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. */ static void swp_pager_meta_free_all(vm_object_t object) { struct swblock **pswap, *swap; vm_pindex_t index; daddr_t v; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return; index = 0; while (object->un_pager.swp.swp_bcount != 0) { mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { v = swap->swb_pages[i]; if (v != SWAPBLK_NONE) { --swap->swb_count; swp_pager_freeswapspace(v, 1); } } if (swap->swb_count != 0) panic( "swap_pager_meta_free_all: swb_count != 0"); *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } mtx_unlock(&swhash_mtx); index += SWAP_META_PAGES; } } /* * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. * * This routine is capable of looking up, popping, or freeing * swapblk assignments in the swap meta data or in the vm_page_t. * The routine typically returns the swapblk being looked-up, or popped, * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block * was invalid. This routine will automatically free any invalid * meta-data swapblks. * * It is not possible to store invalid swapblks in the swap meta data * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * SWM_FREE remove and free swap block from metadata * SWM_POP remove from meta data but do not free.. pop it out */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblock **pswap; struct swblock *swap; daddr_t r1; int idx; VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists of the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); r1 = SWAPBLK_NONE; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) != NULL) { idx = pindex & SWAP_META_MASK; r1 = swap->swb_pages[idx]; if (r1 != SWAPBLK_NONE) { if (flags & SWM_FREE) { swp_pager_freeswapspace(r1, 1); r1 = SWAPBLK_NONE; } if (flags & (SWM_FREE|SWM_POP)) { swap->swb_pages[idx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } } } mtx_unlock(&swhash_mtx); return (r1); } /* * Returns the least page index which is greater than or equal to the * parameter pindex and for which there is a swap block allocated. * Returns object's size if the object's type is not swap or if there * are no allocated swap blocks for the object after the requested * pindex. */ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { struct swblock **pswap, *swap; vm_pindex_t i, j, lim; int idx; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP || object->un_pager.swp.swp_bcount == 0) return (object->size); mtx_lock(&swhash_mtx); for (j = pindex; j < object->size; j = lim) { pswap = swp_pager_hash(object, j); lim = rounddown2(j + SWAP_META_PAGES, SWAP_META_PAGES); if (lim > object->size) lim = object->size; if ((swap = *pswap) != NULL) { for (idx = j & SWAP_META_MASK, i = j; i < lim; i++, idx++) { if (swap->swb_pages[idx] != SWAPBLK_NONE) goto found; } } } i = object->size; found: mtx_unlock(&swhash_mtx); return (i); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swap_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: sx_xunlock(&swdev_syscall_lock); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message and return -1; otherwise, return 0. */ static int swapon_check_swzone(unsigned long npages) { unsigned long maxpages; /* absolute maximum we can handle assuming 100% efficiency */ maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES; /* recommend using no more than half that amount */ if (npages > maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", npages, maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); return (-1); } return (0); } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf( "WARNING: reducing swap size to maximum of %luMB per unit\n", mblocks / 1024 / 1024 * PAGE_SIZE); nblks = mblocks; } sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_flags = 0; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; sp->sw_blist = blist_create(nblks, M_WAITOK); /* * Do not free the first two block in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, 2, nblks - 2); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks; swap_total += (vm_ooffset_t)nblks * PAGE_SIZE; swapon_check_swzone(swap_total / PAGE_SIZE); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: sx_xunlock(&swdev_syscall_lock); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks, dvbase; #ifdef MAC int error; #endif sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (vm_cnt.v_free_count + vm_cnt.v_cache_count + swap_pager_avail < nblks + nswap_lowat) { return (ENOMEM); } /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) { swap_pager_avail -= blist_fill(sp->sw_blist, dvbase, dmmax); } swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); mtx_lock(&sw_dev_mtx); sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); sx_xunlock(&swdev_syscall_lock); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { if (vn_isdisk(sp->sw_vp, NULL)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; int error; if (arg2 != 1) /* name length */ return (EINVAL); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_vm_swap_info, "Swap statistics by device"); /* * vmspace_swap_count() - count the approximate swap usage in pages for a * vmspace. * * The map must be locked. * * Swap usage is determined by taking the proportional swap used by * VM objects backing the VM map. To make up for fractional losses, * if the VM object has any swap use at all the associated map entries * count for at least 1 swap page. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; long count, n; map = &vmspace->vm_map; count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = cur->object.vm_object) != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_SWAP && object->un_pager.swp.swp_bcount != 0) { n = (cur->end - cur->start) / PAGE_SIZE; count += object->un_pager.swp.swp_bcount * SWAP_META_PAGES * n / object->size + 1; } VM_OBJECT_WUNLOCK(object); } } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } /* * Add a reference to the g_consumer for an inflight transaction. */ static void swapgeom_acquire(struct g_consumer *cp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index++; } /* * Remove a reference from the g_consumer. Post a close event if all * references go away, since the function might be called from the * biodone context. */ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index--; if (cp->index == 0) { if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) sp->sw_id = NULL; } } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } /* * Drop reference we were created with. Do directly since we're in a * special context where we don't have to queue the call to * swapgeom_close_ev(). */ cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* * swapgeom_close() may be called from the biodone context, * where we cannot perform topology changes. Delegate the * work to the events thread. */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static int swapongeom_locked(struct cdev *dev, struct vnode *vp) { struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; pp = g_dev_getprovider(dev); if (pp == NULL) return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 1; /* Number of active I/Os, plus one for being active. */ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); return (0); } static int swapongeom(struct vnode *vp) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; } else { g_topology_lock(); error = swapongeom_locked(vp->v_rdev, vp); g_topology_unlock(); } VOP_UNLOCK(vp, 0); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp, 0); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) { int error, new, n; new = nsw_wcount_async_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new > nswbuf / 2 || new < 1) return (EINVAL); mtx_lock(&pbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. If the current async count is too low, * we will need to sqeeze our update slowly in. Sleep with a * higher priority than getpbuf() to finish faster. */ n = new - nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; msleep(&nsw_wcount_async, &pbuf_mtx, PSWP, "swpsysctl", 0); } } mtx_unlock(&pbuf_mtx); return (0); } Index: stable/11/sys/vm/vm_fault.c =================================================================== --- stable/11/sys/vm/vm_fault.c (revision 318715) +++ stable/11/sys/vm/vm_fault.c (revision 318716) @@ -1,1723 +1,1719 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #define PFBAK 4 #define PFFOR 4 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT) #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) #define VM_FAULT_DONTNEED_MIN 1048576 struct faultstate { vm_page_t m; vm_object_t object; vm_pindex_t pindex; vm_page_t first_m; vm_object_t first_object; vm_pindex_t first_pindex; vm_map_t map; vm_map_entry_t entry; int map_generation; bool lookup_still_valid; struct vnode *vp; }; static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward); static inline void release_page(struct faultstate *fs) { vm_page_xunbusy(fs->m); vm_page_lock(fs->m); vm_page_deactivate(fs->m); vm_page_unlock(fs->m); fs->m = NULL; } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = false; } } static void unlock_vp(struct faultstate *fs) { if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } static void unlock_and_deallocate(struct faultstate *fs) { vm_object_pip_wakeup(fs->object); VM_OBJECT_WUNLOCK(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_WLOCK(fs->first_object); vm_page_lock(fs->first_m); vm_page_free(fs->first_m); vm_page_unlock(fs->first_m); vm_object_pip_wakeup(fs->first_object); VM_OBJECT_WUNLOCK(fs->first_object); fs->first_m = NULL; } vm_object_deallocate(fs->first_object); unlock_map(fs); unlock_vp(fs); } static void vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot, vm_prot_t fault_type, int fault_flags, bool set_wd) { bool need_dirty; if (((prot & VM_PROT_WRITE) == 0 && (fault_flags & VM_FAULT_DIRTY) == 0) || (m->oflags & VPO_UNMANAGED) != 0) return; VM_OBJECT_ASSERT_LOCKED(m->object); need_dirty = ((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_WIRE) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0; if (set_wd) vm_object_set_writeable_dirty(m->object); else /* * If two callers of vm_fault_dirty() with set_wd == * FALSE, one for the map entry with MAP_ENTRY_NOSYNC * flag set, other with flag clear, race, it is * possible for the no-NOSYNC thread to see m->dirty * != 0 and not clear VPO_NOSYNC. Take vm_page lock * around manipulation of VPO_NOSYNC and * vm_page_dirty() call, to avoid the race and keep * m->oflags consistent. */ vm_page_lock(m); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) { if (m->dirty == 0) { m->oflags |= VPO_NOSYNC; } } else { m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. */ if (need_dirty) vm_page_dirty(m); if (!set_wd) vm_page_unlock(m); if (need_dirty) vm_pager_page_unswapped(m); } static void vm_fault_fill_hold(vm_page_t *m_hold, vm_page_t m) { if (m_hold != NULL) { *m_hold = m; vm_page_lock(m); vm_page_hold(m); vm_page_unlock(m); } } /* * Unlocks fs.first_object and fs.map on success. */ static int vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { vm_page_t m; int rv; MPASS(fs->vp == NULL); m = vm_page_lookup(fs->first_object, fs->first_pindex); /* A busy page can be mapped for read|execute access. */ if (m == NULL || ((prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL) return (KERN_FAILURE); rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0); if (rv != KERN_SUCCESS) return (rv); vm_fault_fill_hold(m_hold, m); vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false); VM_OBJECT_RUNLOCK(fs->first_object); if (!wired) vm_fault_prefault(fs, vaddr, PFBAK, PFFOR); vm_map_lookup_done(fs->map, fs->entry); curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } static void vm_fault_restore_map_lock(struct faultstate *fs) { VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(fs->first_object->paging_in_progress > 0); if (!vm_map_trylock_read(fs->map)) { VM_OBJECT_WUNLOCK(fs->first_object); vm_map_lock_read(fs->map); VM_OBJECT_WLOCK(fs->first_object); } fs->lookup_still_valid = true; } static void vm_fault_populate_check_page(vm_page_t m) { /* * Check each page to ensure that the pager is obeying the * interface: the page must be installed in the object, fully * valid, and exclusively busied. */ MPASS(m != NULL); MPASS(m->valid == VM_PAGE_BITS_ALL); MPASS(vm_page_xbusied(m)); } static void vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, vm_pindex_t last) { vm_page_t m; vm_pindex_t pidx; VM_OBJECT_ASSERT_WLOCKED(object); MPASS(first <= last); for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); vm_page_lock(m); vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } } static int vm_fault_populate(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot, int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold) { vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; int rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(fs->first_object->paging_in_progress > 0); MPASS(fs->first_object->backing_object == NULL); MPASS(fs->lookup_still_valid); pager_first = OFF_TO_IDX(fs->entry->offset); pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1; unlock_map(fs); unlock_vp(fs); /* * Call the pager (driver) populate() method. * * There is no guarantee that the method will be called again * if the current fault is for read, and a future fault is * for write. Report the entry's maximum allowed protection * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, fault_type, fs->entry->max_protection, &pager_first, &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { /* * VM_PAGER_BAD is the backdoor for a pager to request * normal fault handling. */ vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ return (KERN_NOT_RECEIVER); } if (rv != VM_PAGER_OK) return (KERN_FAILURE); /* AKA SIGSEGV */ /* Ensure that the driver is obeying the interface. */ MPASS(pager_first <= pager_last); MPASS(fs->first_pindex <= pager_last); MPASS(fs->first_pindex >= pager_first); MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) { vm_fault_populate_cleanup(fs->first_object, pager_first, pager_last); return (KERN_RESOURCE_SHORTAGE); /* RetryFault */ } /* * The map is unchanged after our last unlock. Process the fault. * * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. * In case the pager expanded the range, clip it to fit into * the map entry. */ map_first = OFF_TO_IDX(fs->entry->offset); if (map_first > pager_first) { vm_fault_populate_cleanup(fs->first_object, pager_first, map_first - 1); pager_first = map_first; } map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1; if (map_last < pager_last) { vm_fault_populate_cleanup(fs->first_object, map_last + 1, pager_last); pager_last = map_last; } for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx); pidx <= pager_last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, true); VM_OBJECT_WUNLOCK(fs->first_object); pmap_enter(fs->map->pmap, fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset, m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); VM_OBJECT_WLOCK(fs->first_object); if (pidx == fs->first_pindex) vm_fault_fill_hold(m_hold, m); vm_page_lock(m); if ((fault_flags & VM_FAULT_WIRE) != 0) { KASSERT(wired, ("VM_FAULT_WIRE && !wired")); vm_page_wire(m); } else { vm_page_activate(m); } vm_page_unlock(m); vm_page_xunbusy(m); } curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } /* * vm_fault: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags) { struct thread *td; int result; td = curthread; if ((td->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULT)) ktrfault(vaddr, fault_type); #endif result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags, NULL); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND)) ktrfaultend(result); #endif return (result); } int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { - vm_prot_t prot; - vm_object_t next_object; struct faultstate fs; struct vnode *vp; + vm_object_t next_object, retry_object; vm_offset_t e_end, e_start; + vm_pindex_t retry_pindex; + vm_prot_t prot, retry_prot; int ahead, alloc_req, behind, cluster_offset, error, era, faultcount; int locked, nera, result, rv; u_char behavior; boolean_t wired; /* Passed by reference. */ bool dead, growstack, hardfault, is_first_object_locked; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = 0; nera = -1; growstack = true; hardfault = false; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = false; goto RetryFault; } unlock_vp(&fs); return (result); } fs.map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION && fs.entry->wiring_thread != curthread) { vm_map_unlock_read(fs.map); vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { unlock_vp(&fs); fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else vm_map_unlock(fs.map); goto RetryFault; } if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); else KASSERT((fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); /* * Try to avoid lock contention on the top-level object through * special-case handling of some types of page faults, specifically, * those that are both (1) mapping an existing page from the top- * level object and (2) not having to mark that object as containing * dirty pages. Under these conditions, a read lock on the top-level * object suffices, allowing multiple page faults of a similar type to * run in parallel on the same top-level object. */ if (fs.vp == NULL /* avoid locked vnode leak */ && (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0 && /* avoid calling vm_object_set_writeable_dirty() */ ((prot & VM_PROT_WRITE) == 0 || (fs.first_object->type != OBJT_VNODE && (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) || (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) { VM_OBJECT_RLOCK(fs.first_object); if ((prot & VM_PROT_WRITE) == 0 || (fs.first_object->type != OBJT_VNODE && (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) || (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) { rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type, fault_flags, wired, m_hold); if (rv == KERN_SUCCESS) return (rv); } if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); } } else { VM_OBJECT_WLOCK(fs.first_object); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = true; fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is marked for imminent termination, * we retry here, since the collapse pass has raced * with us. Otherwise, if we see terminally dead * object, return fail. */ if ((fs.object->flags & OBJ_DEAD) != 0) { dead = fs.object->type == OBJT_DEAD; unlock_and_deallocate(&fs); if (dead) return (KERN_PROTECTION_FAILURE); pause("vmf_de", 1); goto RetryFault; } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * Wait/Retry if the page is busy. We have to do this * if the page is either exclusive or shared busy * because the vm_pager may be using read busy for * pageouts (and even pageins if it is the vnode * pager), and we could end up trying to pagein and * pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a shared busied page except, perhaps, * to pmap it. */ if (vm_page_busied(fs.m)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_lock(fs.m); vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_xbusy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m)); /* * Page is not resident. If the pager might contain the page * or this is the beginning of the search, allocate a new * page. (Default objects are zero-fill, so there is no real * pager for them.) */ if (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } if (fs.object == fs.first_object && (fs.first_object->flags & OBJ_POPULATE) != 0 && fs.first_object->shadow_count == 0) { rv = vm_fault_populate(&fs, vaddr, prot, fault_type, fault_flags, wired, m_hold); switch (rv) { case KERN_SUCCESS: case KERN_FAILURE: unlock_and_deallocate(&fs); return (rv); case KERN_RESOURCE_SHORTAGE: unlock_and_deallocate(&fs); goto RetryFault; case KERN_NOT_RECEIVER: /* * Pager's populate() method * returned VM_PAGER_BAD. */ break; default: panic("inconsistent return codes"); } } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 vm_object_color(fs.object, atop(vaddr) - fs.pindex); #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; - } else if (fs.m->valid == VM_PAGE_BITS_ALL) - break; + } } readrest: /* * At this point, we have either allocated a new page or found * an existing page that is only partially valid. * * We hold a reference on the current object and the page is * exclusive busied. */ /* * If the pager for the current object might have the page, * then determine the number of additional pages to read and * potentially reprioritize previously read pages for earlier * reclamation. These operations should only be performed * once per page fault. Even if the current pager doesn't * have the page, the number of additional pages to read will * apply to subsequent objects in the shadow chain. */ if (fs.object->type != OBJT_DEFAULT && nera == -1 && !P_KILLED(curproc)) { KASSERT(fs.lookup_still_valid, ("map unlocked")); era = fs.entry->read_ahead; behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM) { nera = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { nera = VM_FAULT_READ_AHEAD_MAX; if (vaddr == fs.entry->next_read) vm_fault_dontneed(&fs, vaddr, nera); } else if (vaddr == fs.entry->next_read) { /* * This is a sequential fault. Arithmetically * increase the requested number of pages in * the read-ahead window. The requested * number of pages is "# of sequential faults * x (read ahead min + 1) + read ahead min" */ nera = VM_FAULT_READ_AHEAD_MIN; if (era > 0) { nera += era + 1; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; } if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_dontneed(&fs, vaddr, nera); } else { /* * This is a non-sequential fault. */ nera = 0; } if (era != nera) { /* * A read lock on the map suffices to update * the read ahead count safely. */ fs.entry->read_ahead = nera; } /* * Prepare for unlocking the map. Save the map * entry's start and end addresses, which are used to * optimize the size of the pager operation below. * Even if the map entry's addresses change after * unlocking the map, using the saved addresses is * safe. */ e_start = fs.entry->start; e_end = fs.entry->end; } /* * Call the pager to retrieve the page if there is a chance * that the pager has it, and potentially retrieve additional * pages at the same time. */ if (fs.object->type != OBJT_DEFAULT) { /* * Release the map lock before locking the vnode or * sleeping in the pager. (If the current object has * a shadow, then an earlier iteration of this loop * may have already unlocked the map.) */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE && (vp = fs.object->handle) != fs.vp) { /* * Perform an unlock in case the desired vnode * changed while the map was unlocked during a * retry. */ unlock_vp(&fs); locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* * We must not sleep acquiring the vnode lock * while we have the page exclusive busied or * the object's paging-in-progress count * incremented. Otherwise, we could deadlock. */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * Page in the requested page and hint the pager, * that it may bring up surrounding pages. */ if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else { /* Is this a sequential fault? */ if (nera > 0) { behind = 0; ahead = nera; } else { /* * Request a cluster of pages that is * aligned to a VM_FAULT_READ_DEFAULT * page offset boundary within the * object. Alignment to a page offset * boundary is more likely to coincide * with the underlying file system * block than alignment to a virtual * address boundary. */ cluster_offset = fs.pindex % VM_FAULT_READ_DEFAULT; behind = ulmin(cluster_offset, atop(vaddr - e_start)); ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset; } ahead = ulmin(ahead, atop(e_end - vaddr) - 1); } rv = vm_pager_get_pages(fs.object, &fs.m, 1, &behind, &ahead); if (rv == VM_PAGER_OK) { faultcount = behind + 1 + ahead; hardfault = true; break; /* break to PAGE HAS BEEN FOUND */ } if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * If an I/O error occurred or the requested page was * outside the range of the pager, clean up and return * an error. */ if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { vm_page_lock(fs.m); if (fs.m->wire_count == 0) vm_page_free(fs.m); else vm_page_xunbusy_maybelocked(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return (rv == VM_PAGER_ERROR ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } /* * The requested page does not exist at this object/ * offset. Remove the invalid page from the object, * waking up anyone waiting for it, and continue on to * the next object. However, if this is the top-level * object, we must leave the busy page in place to * prevent another process from rushing past us, and * inserting the page in that object at the same time * that we are. */ if (fs.object != fs.first_object) { vm_page_lock(fs.m); if (fs.m->wire_count == 0) vm_page_free(fs.m); else vm_page_xunbusy_maybelocked(fs.m); vm_page_unlock(fs.m); fs.m = NULL; } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } vm_page_assert_xbusied(fs.m); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = false; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { vm_page_lock(fs.m); vm_page_remove(fs.m); vm_page_unlock(fs.m); vm_page_lock(fs.first_m); vm_page_replace_checked(fs.m, fs.first_object, fs.first_pindex, fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_page_dirty(fs.m); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(fs.m, fs.first_object, fs.object, OFF_TO_IDX( fs.first_object->backing_object_offset)); #endif /* * Removing the page from the backing object * unbusied it. */ vm_page_xbusy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_WIRE) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, PQ_INACTIVE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { - vm_object_t retry_object; - vm_pindex_t retry_pindex; - vm_prot_t retry_prot; - if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = true; if (fs.map->timestamp != fs.map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, save the virtual address that * should be faulted on next under a sequential access pattern to the * map entry. A read lock on the map suffices to update this address * safely. */ if (hardfault) fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true); vm_page_assert_xbusied(fs.m); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, prot, fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 && wired == 0) vm_fault_prefault(&fs, vaddr, faultcount > 0 ? behind : PFBAK, faultcount > 0 ? ahead : PFFOR); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if ((fault_flags & VM_FAULT_WIRE) != 0) { KASSERT(wired, ("VM_FAULT_WIRE && !wired")); vm_page_wire(fs.m); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_xunbusy(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; #ifdef RACCT if (racct_enable && fs.object->type == OBJT_VNODE) { PROC_LOCK(curproc); if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { racct_add_force(curproc, RACCT_WRITEBPS, PAGE_SIZE + behind * PAGE_SIZE); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } else { racct_add_force(curproc, RACCT_READBPS, PAGE_SIZE + ahead * PAGE_SIZE); racct_add_force(curproc, RACCT_READIOPS, 1); } PROC_UNLOCK(curproc); } #endif } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of pages that precede the faulting pindex within * the first object of the shadow chain. Essentially, perform the equivalent * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes * the faulting pindex by the cluster size when the pages read by vm_fault() * cross a cluster-size boundary. The cluster size is the greater of the * smallest superpage size and VM_FAULT_DONTNEED_MIN. * * When "fs->first_object" is a shadow object, the pages in the backing object * that precede the faulting pindex are deactivated by vm_fault(). So, this * function must only be concerned with pages in the first object. */ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { vm_map_entry_t entry; vm_object_t first_object, object; vm_offset_t end, start; vm_page_t m, m_next; vm_pindex_t pend, pstart; vm_size_t size; object = fs->object; VM_OBJECT_ASSERT_WLOCKED(object); first_object = fs->first_object; if (first_object != object) { if (!VM_OBJECT_TRYWLOCK(first_object)) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WLOCK(first_object); VM_OBJECT_WLOCK(object); } } /* Neither fictitious nor unmanaged pages can be reclaimed. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { size = VM_FAULT_DONTNEED_MIN; if (MAXPAGESIZES > 1 && size < pagesizes[1]) size = pagesizes[1]; end = rounddown2(vaddr, size); if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && (entry = fs->entry)->start < end) { if (end - entry->start < size) start = entry->start; else start = end - size; pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); pstart = OFF_TO_IDX(entry->offset) + atop(start - entry->start); m_next = vm_page_find_least(first_object, pstart); pend = OFF_TO_IDX(entry->offset) + atop(end - entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); if (m->valid != VM_PAGE_BITS_ALL || vm_page_busied(m)) continue; /* * Don't clear PGA_REFERENCED, since it would * likely represent a reference by a different * process. * * Typically, at this point, prefetched pages * are still in the inactive queue. Only * pages that triggered page faults are in the * active queue. */ vm_page_lock(m); vm_page_deactivate(m); vm_page_unlock(m); } } } if (first_object != object) VM_OBJECT_WUNLOCK(first_object); } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; entry = fs->entry; if (addra < backward * PAGE_SIZE) { starta = entry->start; } else { starta = addra - backward * PAGE_SIZE; if (starta < entry->start) starta = entry->start; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE); if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { VM_OBJECT_RUNLOCK(lobject); break; } if (m->valid == VM_PAGE_BITS_ALL && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned. */ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); /* * Check for illegal addresses. */ if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. */ for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault_hold(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) { vm_page_lock(*mp); vm_page_unhold(*mp); vm_page_unlock(*mp); } return (-1); } /* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; access = prot = dst_entry->protection; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, atop(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_object->charge = dst_entry->end - dst_entry->start; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else if (dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object does share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in. * Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); dst_m->valid = VM_PAGE_BITS_ALL; dst_m->dirty = VM_PAGE_BITS_ALL; } else { dst_m = src_m; if (vm_page_sleep_if_busy(dst_m, "fltupg")) goto again; vm_page_xbusy(dst_m); KASSERT(dst_m->valid == VM_PAGE_BITS_ALL, ("invalid dst page %p", dst_m)); } VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. */ pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { if (src_m != dst_m) { vm_page_lock(src_m); vm_page_unwire(src_m, PQ_INACTIVE); vm_page_unlock(src_m); vm_page_lock(dst_m); vm_page_wire(dst_m); vm_page_unlock(dst_m); } else { KASSERT(dst_m->wire_count > 0, ("dst_m %p is not wired", dst_m)); } } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); } Index: stable/11/sys/vm/vm_map.c =================================================================== --- stable/11/sys/vm/vm_map.c (revision 318715) +++ stable/11/sys/vm/vm_map.c (revision 318716) @@ -1,4337 +1,4335 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a self-adjusting binary search tree of these * entries is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t mapzone; static uma_zone_t vmspace_zone; static int vmspace_zinit(void *mem, int size, int flags); static int vm_map_zinit(void *mem, int ize, int flags); static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max); static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow); static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr); #define ENTRY_CHARGED(e) ((e)->cred != NULL || \ ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \ !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, #ifdef INVARIANTS vm_map_zdtor, #else NULL, #endif vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_prealloc(mapzone, MAX_KMAP); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } static int vmspace_zinit(void *mem, int size, int flags) { struct vmspace *vm; vm = (struct vmspace *)mem; vm->vm_map.pmap = NULL; (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags); PMAP_LOCK_INIT(vmspace_pmap(vm)); return (0); } static int vm_map_zinit(void *mem, int size, int flags) { vm_map_t map; map = (vm_map_t)mem; memset(map, 0, sizeof(*map)); mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "vm map (user)"); return (0); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); } static void vm_map_zdtor(void *mem, int size, void *arg) { vm_map_t map; map = (vm_map_t)mem; KASSERT(map->nentries == 0, ("map %p nentries == %d on free.", map, map->nentries)); KASSERT(map->size == 0, ("map %p size == %lu on free.", map, (unsigned long)map->size)); } #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit(). */ struct vmspace * vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit) { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL")); if (pinit == NULL) pinit = &pmap_pinit; if (!pinit(vmspace_pmap(vm))) { uma_zfree(vmspace_zone, vm); return (NULL); } CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max); vm->vm_refcnt = 1; vm->vm_shm = NULL; vm->vm_swrss = 0; vm->vm_tsize = 0; vm->vm_dsize = 0; vm->vm_ssize = 0; vm->vm_taddr = 0; vm->vm_daddr = 0; vm->vm_maxsaddr = 0; return (vm); } #ifdef RACCT static void vmspace_container_reset(struct proc *p) { PROC_LOCK(p); racct_set(p, RACCT_DATA, 0); racct_set(p, RACCT_STACK, 0); racct_set(p, RACCT_RSS, 0); racct_set(p, RACCT_MEMLOCK, 0); racct_set(p, RACCT_VMEM, 0); PROC_UNLOCK(p); } #endif static inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). */ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); pmap_release(vmspace_pmap(vm)); vm->vm_map.pmap = NULL; uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmspace_free() called"); if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; p->p_vmspace = NULL; PROC_VMSPACE_UNLOCK(p); KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } void vmspace_exit(struct thread *td) { int refcnt; struct vmspace *vm; struct proc *p; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * * The last exiting process to reach this point releases as * much of the environment as it can. vmspace_dofree() is the * slower fallback in case another process had a temporary * reference to the vmspace. */ p = td->td_proc; vm = p->p_vmspace; atomic_add_int(&vmspace0.vm_refcnt, 1); do { refcnt = vm->vm_refcnt; if (refcnt > 1 && p->p_vmspace != &vmspace0) { /* Switch now since other proc might free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); if (refcnt == 1) { if (p->p_vmspace != vm) { /* vmspace not yet freed, switch back */ PROC_VMSPACE_LOCK(p); p->p_vmspace = vm; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } pmap_remove_pages(vmspace_pmap(vm)); /* Switch now since this proc will free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); vmspace_dofree(vm); } #ifdef RACCT if (racct_enable) vmspace_container_reset(p); #endif } /* Acquire reference to vmspace owned by another process. */ struct vmspace * vmspace_acquire_ref(struct proc *p) { struct vmspace *vm; int refcnt; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; if (vm == NULL) { PROC_VMSPACE_UNLOCK(p); return (NULL); } do { refcnt = vm->vm_refcnt; if (refcnt <= 0) { /* Avoid 0->1 transition */ PROC_VMSPACE_UNLOCK(p); return (NULL); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1)); if (vm != p->p_vmspace) { PROC_VMSPACE_UNLOCK(p); vmspace_free(vm); return (NULL); } PROC_VMSPACE_UNLOCK(p); return (vm); } /* * Switch between vmspaces in an AIO kernel process. * * The AIO kernel processes switch to and from a user process's * vmspace while performing an I/O operation on behalf of a user * process. The new vmspace is either the vmspace of a user process * obtained from an active AIO request or the initial vmspace of the * AIO kernel process (when it is idling). Because user processes * will block to drain any active AIO requests before proceeding in * exit() or execve(), the vmspace reference count for these vmspaces * can never be 0. This allows for a much simpler implementation than * the loop in vmspace_acquire_ref() above. Similarly, AIO kernel * processes hold an extra reference on their initial vmspace for the * life of the process so that this guarantee is true for any vmspace * passed as 'newvm'. */ void vmspace_switch_aio(struct vmspace *newvm) { struct vmspace *oldvm; /* XXX: Need some way to assert that this is an aio daemon. */ KASSERT(newvm->vm_refcnt > 0, ("vmspace_switch_aio: newvm unreferenced")); oldvm = curproc->p_vmspace; if (oldvm == newvm) return; /* * Point to the new address space and refer to it. */ curproc->p_vmspace = newvm; atomic_add_int(&newvm->vm_refcnt, 1); /* Activate the new mapping. */ pmap_activate(curthread); /* Remove the daemon's reference to the old address space. */ KASSERT(oldvm->vm_refcnt > 1, ("vmspace_switch_aio: oldvm dropping last reference")); vmspace_free(oldvm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_xlock_(&map->lock, file, line); map->timestamp++; } static void vm_map_process_deferred(void) { struct thread *td; vm_map_entry_t entry, next; vm_object_t object; td = curthread; entry = td->td_map_def_user; td->td_map_def_user = NULL; while (entry != NULL) { next = entry->next; if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) { /* * Decrement the object's writemappings and * possibly the vnode's v_writecount. */ KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with writecount")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for writecount")); vnode_pager_release_writecount(object, entry->start, entry->end); } vm_map_entry_deallocate(entry, FALSE); entry = next; } } void _vm_map_unlock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_xunlock_(&map->lock, file, line); vm_map_process_deferred(); } } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_slock_(&map->lock, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); } } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_xlock_(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_slock_(&map->lock, file, line); return (error == 0); } /* * _vm_map_lock_upgrade: [ internal use only ] * * Tries to upgrade a read (shared) lock on the specified map to a write * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a * non-zero value if the upgrade fails. If the upgrade fails, the map is * returned without a read or write lock held. * * Requires that the map be read locked. */ int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { unsigned int last_timestamp; if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else { if (!sx_try_upgrade_(&map->lock, file, line)) { last_timestamp = map->timestamp; sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); /* * If the map's timestamp does not change while the * map is unlocked, then the upgrade succeeds. */ sx_xlock_(&map->lock, file, line); if (last_timestamp != map->timestamp) { sx_xunlock_(&map->lock, file, line); return (1); } } } map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else sx_downgrade_(&map->lock, file, line); } /* * vm_map_locked: * * Returns a non-zero value if the caller holds a write (exclusive) lock * on the specified map and the value "0" otherwise. */ int vm_map_locked(vm_map_t map) { if (map->system_map) return (mtx_owned(&map->system_mtx)); else return (sx_xlocked(&map->lock)); } #ifdef INVARIANTS static void _vm_map_assert_locked(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_assert_(&map->system_mtx, MA_OWNED, file, line); else sx_assert_(&map->lock, SA_XLOCKED, file, line); } #define VM_MAP_ASSERT_LOCKED(map) \ _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE) #else #define VM_MAP_ASSERT_LOCKED(map) #endif /* * _vm_map_unlock_and_wait: * * Atomically releases the lock on the specified map and puts the calling * thread to sleep. The calling thread will remain asleep until either * vm_map_wakeup() is performed on the map or the specified timeout is * exceeded. * * WARNING! This function does not perform deferred deallocations of * objects and map entries. Therefore, the calling thread is expected to * reacquire the map lock after reawakening and later perform an ordinary * unlock operation, such as vm_map_unlock(), before completing its * operation on the map. */ int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line) { mtx_lock(&map_sleep_mtx); if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else sx_xunlock_(&map->lock, file, line); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", timo)); } /* * vm_map_wakeup: * * Awaken any threads that have slept on the map using * vm_map_unlock_and_wait(). */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the map unlock * and the msleep() in _vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } void vm_map_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); map->busy++; } void vm_map_unbusy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); KASSERT(map->busy, ("vm_map_unbusy: not busy")); if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) { vm_map_modflags(map, 0, MAP_BUSY_WAKEUP); wakeup(&map->busy); } } void vm_map_wait_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); while (map->busy) { vm_map_modflags(map, MAP_BUSY_WAKEUP, 0); if (map->system_map) msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0); else sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0); } map->timestamp++; } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) { vm_map_t result; result = uma_zalloc(mapzone, M_WAITOK); CTR1(KTR_VM, "vm_map_create: %p", result); _vm_map_init(result, pmap, min, max); return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. */ static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { map->header.next = map->header.prev = &map->header; map->needs_wakeup = FALSE; map->system_map = 0; map->pmap = pmap; map->min_offset = min; map->max_offset = max; map->flags = 0; map->root = NULL; map->timestamp = 0; map->busy = 0; } void vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, pmap, min, max); mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "user map"); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. */ static inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_set_max_free: * * Set the max_free field in a vm_map_entry. */ static inline void vm_map_entry_set_max_free(vm_map_entry_t entry) { entry->max_free = entry->adj_free; if (entry->left != NULL && entry->left->max_free > entry->max_free) entry->max_free = entry->left->max_free; if (entry->right != NULL && entry->right->max_free > entry->max_free) entry->max_free = entry->right->max_free; } /* * vm_map_entry_splay: * * The Sleator and Tarjan top-down splay algorithm with the * following variation. Max_free must be computed bottom-up, so * on the downward pass, maintain the left and right spines in * reverse order. Then, make a second pass up each side to fix * the pointers and compute max_free. The time bound is O(log n) * amortized. * * The new root is the vm_map_entry containing "addr", or else an * adjacent entry (lower or higher) if addr is not in the tree. * * The map must be locked, and leaves it so. * * Returns: the new root. */ static vm_map_entry_t vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root) { vm_map_entry_t llist, rlist; vm_map_entry_t ltree, rtree; vm_map_entry_t y; /* Special case of empty tree. */ if (root == NULL) return (root); /* * Pass One: Splay down the tree until we find addr or a NULL * pointer where addr would go. llist and rlist are the two * sides in reverse order (bottom-up), with llist linked by * the right pointer and rlist linked by the left pointer in * the vm_map_entry. Wait until Pass Two to set max_free on * the two spines. */ llist = NULL; rlist = NULL; for (;;) { /* root is never NULL in here. */ if (addr < root->start) { y = root->left; if (y == NULL) break; if (addr < y->start && y->left != NULL) { /* Rotate right and put y on rlist. */ root->left = y->right; y->right = root; vm_map_entry_set_max_free(root); root = y->left; y->left = rlist; rlist = y; } else { /* Put root on rlist. */ root->left = rlist; rlist = root; root = y; } } else if (addr >= root->end) { y = root->right; if (y == NULL) break; if (addr >= y->end && y->right != NULL) { /* Rotate left and put y on llist. */ root->right = y->left; y->left = root; vm_map_entry_set_max_free(root); root = y->right; y->right = llist; llist = y; } else { /* Put root on llist. */ root->right = llist; llist = root; root = y; } } else break; } /* * Pass Two: Walk back up the two spines, flip the pointers * and set max_free. The subtrees of the root go at the * bottom of llist and rlist. */ ltree = root->left; while (llist != NULL) { y = llist->right; llist->right = ltree; vm_map_entry_set_max_free(llist); ltree = llist; llist = y; } rtree = root->right; while (rlist != NULL) { y = rlist->left; rlist->left = rtree; vm_map_entry_set_max_free(rlist); rtree = rlist; rlist = y; } /* * Final assembly: add ltree and rtree as subtrees of root. */ root->left = ltree; root->right = rtree; vm_map_entry_set_max_free(root); return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) { CTR4(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map, map->nentries, entry, after_where); VM_MAP_ASSERT_LOCKED(map); KASSERT(after_where == &map->header || after_where->end <= entry->start, ("vm_map_entry_link: prev end %jx new start %jx overlap", (uintmax_t)after_where->end, (uintmax_t)entry->start)); KASSERT(after_where->next == &map->header || entry->end <= after_where->next->start, ("vm_map_entry_link: new end %jx next start %jx overlap", (uintmax_t)entry->end, (uintmax_t)after_where->next->start)); map->nentries++; entry->prev = after_where; entry->next = after_where->next; entry->next->prev = entry; after_where->next = entry; if (after_where != &map->header) { if (after_where != map->root) vm_map_entry_splay(after_where->start, map->root); entry->right = after_where->right; entry->left = after_where; after_where->right = NULL; after_where->adj_free = entry->start - after_where->end; vm_map_entry_set_max_free(after_where); } else { entry->right = map->root; entry->left = NULL; } entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); map->root = entry; } static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev, root; VM_MAP_ASSERT_LOCKED(map); if (entry != map->root) vm_map_entry_splay(entry->start, map->root); if (entry->left == NULL) root = entry->right; else { root = vm_map_entry_splay(entry->start, entry->left); root->right = entry->right; root->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - root->end; vm_map_entry_set_max_free(root); } map->root = root; prev = entry->prev; next = entry->next; next->prev = prev; prev->next = next; map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_entry_resize_free: * * Recompute the amount of free space following a vm_map_entry * and propagate that value up the tree. Call this function after * resizing a map entry in-place, that is, without a call to * vm_map_entry_link() or _unlink(). * * The map must be locked, and leaves it so. */ static void vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry) { /* * Using splay trees without parent pointers, propagating * max_free up the tree is done by moving the entry to the * root and making the change there. */ if (entry != map->root) map->root = vm_map_entry_splay(entry->start, map->root); entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur; boolean_t locked; /* * If the map is empty, then the map entry immediately preceding * "address" is the map's header. */ cur = map->root; if (cur == NULL) *entry = &map->header; else if (address >= cur->start && cur->end > address) { *entry = cur; return (TRUE); } else if ((locked = vm_map_locked(map)) || sx_try_upgrade(&map->lock)) { /* * Splay requires a write lock on the map. However, it only * restructures the binary search tree; it does not otherwise * change the map. Thus, the map's timestamp need not change * on a temporary upgrade. */ map->root = cur = vm_map_entry_splay(address, cur); if (!locked) sx_downgrade(&map->lock); /* * If "address" is contained within a map entry, the new root * is that map entry. Otherwise, the new root is a map entry * immediately before or after "address". */ if (address >= cur->start) { *entry = cur; if (cur->end > address) return (TRUE); } else *entry = cur->prev; } else /* * Since the map is only locked for read access, perform a * standard binary search tree lookup for "address". */ for (;;) { if (address < cur->start) { if (cur->left == NULL) { *entry = cur->prev; break; } cur = cur->left; } else if (cur->end > address) { *entry = cur; return (TRUE); } else { if (cur->right == NULL) { *entry = cur; break; } cur = cur->right; } } return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry, temp_entry; struct ucred *cred; vm_eflags_t protoeflags; vm_inherit_t inheritance; VM_MAP_ASSERT_LOCKED(map); KASSERT((object != kmem_object && object != kernel_object) || (cow & MAP_COPY_ON_WRITE) == 0, ("vm_map_insert: kmem or kernel object and COW")); KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0, ("vm_map_insert: paradoxical MAP_NOFAULT request")); /* * Check that the start and end points are not bogus. */ if (start < map->min_offset || end > map->max_offset || start >= end) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if (prev_entry->next != &map->header && prev_entry->next->start < end) return (KERN_NO_SPACE); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) protoeflags |= MAP_ENTRY_NOFAULT; if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (cow & MAP_STACK_GROWS_DOWN) protoeflags |= MAP_ENTRY_GROWS_DOWN; if (cow & MAP_STACK_GROWS_UP) protoeflags |= MAP_ENTRY_GROWS_UP; if (cow & MAP_VN_WRITECOUNT) protoeflags |= MAP_ENTRY_VN_WRITECNT; if (cow & MAP_INHERIT_SHARE) inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; cred = NULL; if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT)) goto charged; if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) && ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) { if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start)) return (KERN_RESOURCE_SHORTAGE); KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 || object->cred == NULL, ("overcommit: vm_map_insert o %p", object)); cred = curthread->td_ucred; } charged: /* Expand the kernel pmap, if necessary. */ if (map == kernel_map && end > kernel_vm_end) pmap_growkernel(end); if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ VM_OBJECT_WLOCK(object); if (object->ref_count > 1 || object->shadow_count != 0) vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); } else if (prev_entry != &map->header && prev_entry->eflags == protoeflags && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 && prev_entry->end == start && prev_entry->wired_count == 0 && (prev_entry->cred == cred || (prev_entry->object.vm_object != NULL && prev_entry->object.vm_object->cred == cred)) && vm_object_coalesce(prev_entry->object.vm_object, prev_entry->offset, (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end), cred != NULL && (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if (prev_entry->inheritance == inheritance && prev_entry->protection == prot && prev_entry->max_protection == max) { map->size += end - prev_entry->end; prev_entry->end = end; vm_map_entry_resize_free(map, prev_entry); vm_map_simplify_entry(map, prev_entry); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); if (cred != NULL && object != NULL && object->cred != NULL && !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { /* Object already accounts for this uid. */ cred = NULL; } } if (cred != NULL) crhold(cred); /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->cred = NULL; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->avail_ssize = 0; new_entry->inheritance = inheritance; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; new_entry->wiring_thread = NULL; new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT; new_entry->next_read = start; KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry), ("overcommit: vm_map_insert leaks vm_map %p", new_entry)); new_entry->cred = cred; /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; /* * Try to coalesce the new entry with both the previous and next * entries in the list. Previously, we only attempted to coalesce * with the previous entry when object is NULL. Here, we handle the * other cases, which are less common. */ vm_map_simplify_entry(map, new_entry); if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) { vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * vm_map_findspace: * * Find the first fit (lowest VM address) for "length" free bytes * beginning at address >= start in the given map. * * In a vm_map_entry, "adj_free" is the amount of free space * adjacent (higher address) to this entry, and "max_free" is the * maximum amount of contiguous free space in its subtree. This * allows finding a free region in one path down the tree, so * O(log n) amortized with splay trees. * * The map must be locked, and leaves it so. * * Returns: 0 on success, and starting address in *addr, * 1 if insufficient space. */ int vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length, vm_offset_t *addr) /* OUT */ { vm_map_entry_t entry; vm_offset_t st; /* * Request must fit within min/max VM address and must avoid * address wrap. */ if (start < map->min_offset) start = map->min_offset; if (start + length > map->max_offset || start + length < start) return (1); /* Empty tree means wide open address space. */ if (map->root == NULL) { *addr = start; return (0); } /* * After splay, if start comes before root node, then there * must be a gap from start to the root. */ map->root = vm_map_entry_splay(start, map->root); if (start + length <= map->root->start) { *addr = start; return (0); } /* * Root is the last node that might begin its gap before * start, and this is the last comparison where address * wrap might be a problem. */ st = (start > map->root->end) ? start : map->root->end; if (length <= map->root->end + map->root->adj_free - st) { *addr = st; return (0); } /* With max_free, can immediately tell if no solution. */ entry = map->root->right; if (entry == NULL || length > entry->max_free) return (1); /* * Search the right subtree in the order: left subtree, root, * right subtree (first fit). The previous splay implies that * all regions in the right subtree have addresses > start. */ while (entry != NULL) { if (entry->left != NULL && entry->left->max_free >= length) entry = entry->left; else if (entry->adj_free >= length) { *addr = entry->end; return (0); } else entry = entry->right; } /* Can't get here, so panic if we do. */ panic("vm_map_findspace: max_free corrupt"); } int vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_size_t length, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t end; int result; end = start + length; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_fixed: non-NULL backing object for stack")); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if ((cow & MAP_CHECK_EXCL) == 0) vm_map_delete(map, start, end); if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); } else { result = vm_map_insert(map, object, offset, start, end, prot, max, cow); } vm_map_unlock(map); return (result); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t alignment, initial_addr, start; int result; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_find: non-NULL backing object for stack")); if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL || (object->flags & OBJ_COLORED) == 0)) find_space = VMFS_ANY_SPACE; if (find_space >> 8 != 0) { KASSERT((find_space & 0xff) == 0, ("bad VMFS flags")); alignment = (vm_offset_t)1 << (find_space >> 8); } else alignment = 0; initial_addr = *addr; again: start = initial_addr; vm_map_lock(map); do { if (find_space != VMFS_NO_SPACE) { if (vm_map_findspace(map, start, length, addr) || (max_addr != 0 && *addr + length > max_addr)) { vm_map_unlock(map); if (find_space == VMFS_OPTIMAL_SPACE) { find_space = VMFS_ANY_SPACE; goto again; } return (KERN_NO_SPACE); } switch (find_space) { case VMFS_SUPER_SPACE: case VMFS_OPTIMAL_SPACE: pmap_align_superpage(object, offset, addr, length); break; case VMFS_ANY_SPACE: break; default: if ((*addr & (alignment - 1)) != 0) { *addr &= ~(alignment - 1); *addr += alignment; } break; } start = *addr; } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); } else { result = vm_map_insert(map, object, offset, start, start + length, prot, max, cow); } } while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE && find_space != VMFS_ANY_SPACE); vm_map_unlock(map); return (result); } /* * vm_map_simplify_entry: * * Simplify the given map entry by merging with either neighbor. This * routine also has the ability to merge with both neighbors. * * The map must be locked. * * This routine guarantees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or * both neighbors. */ void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev; vm_size_t prevsize, esize; if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0) return; prev = entry->prev; if (prev != &map->header) { prevsize = prev->end - prev->start; if ( (prev->end == entry->start) && (prev->object.vm_object == entry->object.vm_object) && (!prev->object.vm_object || (prev->offset + prevsize == entry->offset)) && (prev->eflags == entry->eflags) && (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && (prev->wired_count == entry->wired_count) && (prev->cred == entry->cred)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; if (entry->prev != &map->header) vm_map_entry_resize_free(map, entry->prev); /* * If the backing object is a vnode object, * vm_object_deallocate() calls vrele(). * However, vrele() does not lock the vnode * because the vnode has additional * references. Thus, the map lock can be kept * without causing a lock-order reversal with * the vnode lock. * * Since we count the number of virtual page * mappings in object->un_pager.vnp.writemappings, * the writemappings value should not be adjusted * when the entry is disposed of. */ if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); if (prev->cred != NULL) crfree(prev->cred); vm_map_entry_dispose(map, prev); } } next = entry->next; if (next != &map->header) { esize = entry->end - entry->start; if ((entry->end == next->start) && (next->object.vm_object == entry->object.vm_object) && (!entry->object.vm_object || (entry->offset + esize == next->offset)) && (next->eflags == entry->eflags) && (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && (next->wired_count == entry->wired_count) && (next->cred == entry->cred)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); /* * See comment above. */ if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); if (next->cred != NULL) crfree(next->cred); vm_map_entry_dispose(map, next); } } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->end > start && entry->start < start, ("_vm_map_clip_start: invalid clip of entry %p", entry)); /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; if (entry->cred != NULL) { object->cred = entry->cred; object->charge = entry->end - entry->start; entry->cred = NULL; } } else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, entry->prev, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); /* * The object->un_pager.vnp.writemappings for the * object of MAP_ENTRY_VN_WRITECNT type entry shall be * kept as is here. The virtual pages are * re-distributed among the clipped entries, so the sum is * left the same. */ } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if ((endaddr) < (entry->end)) \ _vm_map_clip_end((map), (entry), (endaddr)); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->start < end && entry->end > end, ("_vm_map_clip_end: invalid clip of entry %p", entry)); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; if (entry->cred != NULL) { object->cred = entry->cred; object->charge = entry->end - entry->start; entry->cred = NULL; } } else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, entry, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload the specified map's pmap with mappings to the specified * object's memory-resident pages. No further physical pages are * allocated, and no further virtual pages are retrieved from secondary * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a * limited number of page mappings are created at the low-end of the * specified address range. (For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within - * the specified address range are mapped. Because these mappings are - * being created speculatively, cached pages are not reactivated and - * mapped. + * the specified address range are mapped. */ static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t start; vm_page_t p, p_start; vm_pindex_t mask, psize, threshold, tmpidx; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; VM_OBJECT_RLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { VM_OBJECT_RUNLOCK(object); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_LOCK_DOWNGRADE(object); } psize = atop(size); if (psize + pindex > object->size) { if (object->size < pindex) { VM_OBJECT_RUNLOCK(object); return; } psize = object->size - pindex; } start = 0; p_start = NULL; threshold = MAX_INIT_PT; p = vm_page_find_least(object, pindex); /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if (((flags & MAP_PREFAULT_MADVISE) != 0 && vm_cnt.v_free_count < vm_cnt.v_free_reserved) || ((flags & MAP_PREFAULT_PARTIAL) != 0 && tmpidx >= threshold)) { psize = tmpidx; break; } if (p->valid == VM_PAGE_BITS_ALL) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; } /* Jump ahead if a superpage mapping is possible. */ if (p->psind > 0 && ((addr + ptoa(tmpidx)) & (pagesizes[p->psind] - 1)) == 0) { mask = atop(pagesizes[p->psind]) - 1; if (tmpidx + mask < psize && vm_page_ps_is_valid(p)) { p += mask; threshold += mask; } } } else if (p_start != NULL) { pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } if (p_start != NULL) pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); VM_OBJECT_RUNLOCK(object); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current, entry; vm_object_t obj; struct ucred *cred; vm_prot_t old_prot; if (start == end) return (KERN_SUCCESS); vm_map_lock(map); /* * Ensure that we are not concurrently wiring pages. vm_map_wire() may * need to fault pages into the map and will drop the map lock while * doing so, and the VM object may end up in an inconsistent state if we * update the protection on the map entry in between faults. */ vm_map_wait_busy(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else { entry = entry->next; } /* * Make a first pass to check for protection violations. */ for (current = entry; current != &map->header && current->start < end; current = current->next) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } } /* * Do an accounting pass for private read-only mappings that * now will do cow due to allowed write (e.g. debugger sets * breakpoint on text segment) */ for (current = entry; current != &map->header && current->start < end; current = current->next) { vm_map_clip_end(map, current, end); if (set_max || ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 || ENTRY_CHARGED(current)) { continue; } cred = curthread->td_ucred; obj = current->object.vm_object; if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) { if (!swap_reserve(current->end - current->start)) { vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } crhold(cred); current->cred = cred; continue; } VM_OBJECT_WLOCK(obj); if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { VM_OBJECT_WUNLOCK(obj); continue; } /* * Charge for the whole object allocation now, since * we cannot distinguish between non-charged and * charged clipped mapping of the same object later. */ KASSERT(obj->charge == 0, ("vm_map_protect: object %p overcharged (entry %p)", obj, current)); if (!swap_reserve(ptoa(obj->size))) { VM_OBJECT_WUNLOCK(obj); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } crhold(cred); obj->cred = cred; obj->charge = ptoa(obj->size); VM_OBJECT_WUNLOCK(obj); } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ for (current = entry; current != &map->header && current->start < end; current = current->next) { old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * For user wired map entries, the normal lazy evaluation of * write access upgrades through soft page faults is * undesirable. Instead, immediately copy any pages that are * copy-on-write and enable write access in the physical map. */ if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 && (current->protection & VM_PROT_WRITE) != 0 && (old_prot & VM_PROT_WRITE) == 0) vm_fault_copy_entry(map, map, current, current, NULL); /* * When restricting access, update the physical map. Worry * about copy-on-write here. */ if ((old_prot & ~current->protection) != 0) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK } vm_map_simplify_entry(map, current); } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t current, entry; int modify_map = 0; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: if (start == end) return (KERN_SUCCESS); modify_map = 1; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: if (start == end) return (KERN_SUCCESS); vm_map_lock_read(map); break; default: return (KERN_INVALID_ARGUMENT); } /* * Locate starting entry and clip if necessary. */ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = entry->next; } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_simplify_entry(map, current); } vm_map_unlock(map); } else { vm_pindex_t pstart, pend; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { vm_offset_t useEnd, useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; pstart = OFF_TO_IDX(current->offset); pend = pstart + atop(current->end - current->start); useStart = current->start; useEnd = current->end; if (current->start < start) { pstart += atop(start - current->start); useStart = start; } if (current->end > end) { pend -= atop(current->end - end); useEnd = end; } if (pstart >= pend) continue; /* * Perform the pmap_advise() before clearing * PGA_REFERENCED in vm_page_advise(). Otherwise, a * concurrent pmap operation, such as pmap_remove(), * could clear a reference in the pmap and set * PGA_REFERENCED on the page before the pmap_advise() * had completed. Consequently, the page would appear * referenced based upon an old reference that * occurred before this pmap_advise() ran. */ if (behav == MADV_DONTNEED || behav == MADV_FREE) pmap_advise(map->pmap, useStart, useEnd, behav); vm_object_madvise(current->object.vm_object, pstart, pend, behav); /* * Pre-populate paging structures in the * WILLNEED case. For wired entries, the * paging structures are already populated. */ if (behav == MADV_WILLNEED && current->wired_count == 0) { vm_map_pmap_enter(map, useStart, current->protection, current->object.vm_object, pstart, ptoa(pend - pstart), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vmspace_fork. */ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } if (start == end) return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_unwire; if (start == end) return (KERN_SUCCESS); user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user unwiring? */ } vm_map_lock(map); if (last_timestamp+1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * First_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } /* * If system unwiring, require that the entry is system wired. */ if (!user_unwire && vm_map_entry_system_wired_count(entry) == 0) { end = entry->end; rv = KERN_INVALID_ARGUMENT; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_unwire: lookup failed")); } for (entry = first_entry; entry != &map->header && entry->start < end; entry = entry->next) { /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for draining * MAP_ENTRY_IN_TRANSITION. Moreover, another thread * could be simultaneously wiring this new mapping * entry. Detect these cases and skip any entries * marked as in transition by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, ("vm_map_unwire: !HOLESOK and new/changed entry")); continue; } if (rv == KERN_SUCCESS && (!user_unwire || (entry->eflags & MAP_ENTRY_USER_WIRED))) { if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; if (entry->wired_count == 1) vm_map_entry_unwire(map, entry); else entry->wired_count--; } KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_unwire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_unwire: alien wire %p", entry)); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_wire_entry_failure: * * Handle a wiring failure on the given entry. * * The map should be locked. */ static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && entry->wired_count == 1, ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); KASSERT(failed_addr < entry->end, ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); /* * If any pages at the start of this entry were successfully wired, * then unwire them. */ if (failed_addr > entry->start) { pmap_unwire(map->pmap, entry->start, failed_addr); vm_object_unwire(entry->object.vm_object, entry->offset, failed_addr - entry->start, PQ_ACTIVE); } /* * Assign an out-of-range value to represent the failure to wire this * entry. */ entry->wired_count = -1; } /* * vm_map_wire: * * Implements both kernel and user wiring. */ int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t faddr, saved_end, saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_wire; vm_prot_t prot; if (start == end) return (KERN_SUCCESS); prot = 0; if (flags & VM_MAP_WIRE_WRITE) prot |= VM_PROT_WRITE; user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user wiring? */ } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * first_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || (entry->protection & prot) != prot) { entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; if ((flags & VM_MAP_WIRE_HOLESOK) == 0) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } goto next_entry; } if (entry->wired_count == 0) { entry->wired_count++; saved_start = entry->start; saved_end = entry->end; /* * Release the map lock, relying on the in-transition * mark. Mark the map busy for fork. */ vm_map_busy(map); vm_map_unlock(map); faddr = saved_start; do { /* * Simulate a fault to get the page and enter * it into the physical map. */ if ((rv = vm_fault(map, faddr, VM_PROT_NONE, VM_FAULT_WIRE)) != KERN_SUCCESS) break; } while ((faddr += PAGE_SIZE) < saved_end); vm_map_lock(map); vm_map_unbusy(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ result = vm_map_lookup_entry(map, saved_start, &tmp_entry); KASSERT(result, ("vm_map_wire: lookup failed")); if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; while (entry->end < saved_end) { /* * In case of failure, handle entries * that were not fully wired here; * fully wired entries are handled * later. */ if (rv != KERN_SUCCESS && faddr < entry->end) vm_map_wire_entry_failure(map, entry, faddr); entry = entry->next; } } last_timestamp = map->timestamp; if (rv != KERN_SUCCESS) { vm_map_wire_entry_failure(map, entry, faddr); end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ next_entry: if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_wire: lookup failed")); } for (entry = first_entry; entry != &map->header && entry->start < end; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) goto next_entry_done; /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for faulting in the * pages or draining MAP_ENTRY_IN_TRANSITION. * Moreover, another thread could be simultaneously * wiring this new mapping entry. Detect these cases * and skip any entries marked as in transition by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, ("vm_map_wire: !HOLESOK and new/changed entry")); continue; } if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { /* * Undo the wiring. Wiring succeeded on this entry * but failed on a later entry. */ if (entry->wired_count == 1) vm_map_entry_unwire(map, entry); else entry->wired_count--; } next_entry_done: KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_wire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_wire: alien wire %p", entry)); entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WIRE_SKIPPED); entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; unsigned int last_timestamp; boolean_t failed; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = entry->start; end = entry->end; } /* * Make a first pass to check for user-wired memory and holes. */ for (current = entry; current != &map->header && current->start < end; current = current->next) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(map->pmap, start, end); failed = FALSE; /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current != &map->header && current->start < end;) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } vm_object_reference(object); last_timestamp = map->timestamp; vm_map_unlock_read(map); if (!vm_object_sync(object, offset, size, syncio, invalidate)) failed = TRUE; start += size; vm_object_deallocate(object); vm_map_lock_read(map); if (last_timestamp == map->timestamp || !vm_map_lookup_entry(map, start, ¤t)) current = current->next; } vm_map_unlock_read(map); return (failed ? KERN_FAILURE : KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->wired_count > 0, ("vm_map_entry_unwire: entry %p isn't wired", entry)); pmap_unwire(map->pmap, entry->start, entry->end); vm_object_unwire(entry->object.vm_object, entry->offset, entry->end - entry->start, PQ_ACTIVE); entry->wired_count = 0; } static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) vm_object_deallocate(entry->object.vm_object); uma_zfree(system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, count, size1; vm_size_t size; vm_map_entry_unlink(map, entry); object = entry->object.vm_object; size = entry->end - entry->start; map->size -= size; if (entry->cred != NULL) { swap_release_by_cred(size, entry->cred); crfree(entry->cred); } if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object != NULL)) { KASSERT(entry->cred == NULL || object->cred == NULL || (entry->eflags & MAP_ENTRY_NEEDS_COPY), ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); count = atop(size); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_WLOCK(object); if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT | OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || object == kernel_object || object == kmem_object)) { vm_object_collapse(object); /* * The option OBJPR_NOTMAPPED can be passed here * because vm_map_delete() already performed * pmap_remove() on the only mapping to this range * of pages. */ vm_object_page_remove(object, offidxstart, offidxend, OBJPR_NOTMAPPED); if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && offidxstart < object->size) { size1 = object->size; object->size = offidxstart; if (object->cred != NULL) { size1 -= object->size; KASSERT(object->charge >= ptoa(size1), ("object %p charge < 0", object)); swap_release_by_cred(ptoa(size1), object->cred); object->charge -= ptoa(size1); } } } VM_OBJECT_WUNLOCK(object); } else entry->object.vm_object = NULL; if (map->system_map) vm_map_entry_deallocate(entry, TRUE); else { entry->next = curthread->td_map_def_user; curthread->td_map_def_user = entry; } } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. */ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry; vm_map_entry_t first_entry; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); } /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on * user maps. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && vm_map_entry_system_wired_count(entry) != 0)) { unsigned int last_timestamp; vm_offset_t saved_start; vm_map_entry_t tmp_entry; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, 0); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) entry = tmp_entry->next; else { entry = tmp_entry; vm_map_clip_start(map, entry, saved_start); } } continue; } vm_map_clip_end(map, entry, end); next = entry->next; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) { vm_map_entry_unwire(map, entry); } pmap_remove(map->pmap, entry->start, entry->end); /* * Delete the entry only after removing all pmap * entries pointing to its pages. (Otherwise, its * page frames may be reallocated, and any modify bits * will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { if (entry == &map->header) return (FALSE); /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_ooffset_t *fork_charge) { vm_object_t src_object; vm_map_entry_t fake_entry; vm_offset_t size; struct ucred *cred; int charged; VM_MAP_ASSERT_LOCKED(dst_map); if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0 || (src_entry->protection & VM_PROT_WRITE) == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && (src_entry->protection & VM_PROT_WRITE) != 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ size = src_entry->end - src_entry->start; if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_WLOCK(src_object); charged = ENTRY_CHARGED(src_entry); if (src_object->handle == NULL && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { vm_object_collapse(src_object); if ((src_object->flags & (OBJ_NOSPLIT | OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); if (src_entry->cred != NULL && !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { KASSERT(src_object->cred == NULL, ("OVERCOMMIT: vm_map_copy_entry: cred %p", src_object)); src_object->cred = src_entry->cred; src_object->charge = size; } VM_OBJECT_WUNLOCK(src_object); dst_entry->object.vm_object = src_object; if (charged) { cred = curthread->td_ucred; crhold(cred); dst_entry->cred = cred; *fork_charge += size; if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { crhold(cred); src_entry->cred = cred; *fork_charge += size; } } src_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->offset = src_entry->offset; if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) { /* * MAP_ENTRY_VN_WRITECNT cannot * indicate write reference from * src_entry, since the entry is * marked as needs copy. Allocate a * fake entry that is used to * decrement object->un_pager.vnp.writecount * at the appropriate time. Attach * fake_entry to the deferred list. */ fake_entry = vm_map_entry_create(dst_map); fake_entry->eflags = MAP_ENTRY_VN_WRITECNT; src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT; vm_object_reference(src_object); fake_entry->object.vm_object = src_object; fake_entry->start = src_entry->start; fake_entry->end = src_entry->end; fake_entry->next = curthread->td_map_def_user; curthread->td_map_def_user = fake_entry; } } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; if (src_entry->cred != NULL) { dst_entry->cred = curthread->td_ucred; crhold(dst_entry->cred); *fork_charge += size; } } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * We don't want to make writeable wired pages copy-on-write. * Immediately copy these pages into the new map by simulating * page faults. The new pages are pageable. */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, fork_charge); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) */ static void vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_map_entry_t entry) { vm_size_t entrysize; vm_offset_t newend; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { vm2->vm_ssize += btoc(entrysize); } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); vm2->vm_dsize += btoc(newend - entry->start); } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); vm2->vm_tsize += btoc(newend - entry->start); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * XXX It might be worth coalescing the entries added to the new vmspace. * * The source map must not be locked. */ struct vmspace * vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) { struct vmspace *vm2; vm_map_t new_map, old_map; vm_map_entry_t new_entry, old_entry; vm_object_t object; int locked; old_map = &vm1->vm_map; /* Copy immutable fields of vm1 to vm2. */ vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL); if (vm2 == NULL) return (NULL); vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; vm_map_lock(old_map); if (old_map->busy) vm_map_wait_busy(old_map); new_map = &vm2->vm_map; locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ KASSERT(locked, ("vmspace_fork: lock failed")); old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. */ object = old_entry->object.vm_object; if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = 0; if (old_entry->cred != NULL) { object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, old_entry->end - old_entry->start); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; /* Transfer the second reference too. */ vm_object_reference( old_entry->object.vm_object); /* * As in vm_map_simplify_entry(), the * vnode lock will not be acquired in * this call to vm_object_deallocate(). */ vm_object_deallocate(object); object = old_entry->object.vm_object; } VM_OBJECT_WLOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); if (old_entry->cred != NULL) { KASSERT(object->cred == NULL, ("vmspace_fork both cred")); object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } /* * Assert the correct state of the vnode * v_writecount while the object is locked, to * not relock it later for the assertion * correctness. */ if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT && object->type == OBJT_VNODE) { KASSERT(((struct vnode *)object->handle)-> v_writecount > 0, ("vmspace_fork: v_writecount %p", object)); KASSERT(object->un_pager.vnp.writemappings > 0, ("vmspace_fork: vnp.writecount %p", object)); } VM_OBJECT_WUNLOCK(object); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) { vnode_pager_update_writecount(object, new_entry->start, new_entry->end); } /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; /* * Copied entry is COW over the old object. */ new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->cred = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry, fork_charge); break; } old_entry = old_entry->next; } /* * Use inlined vm_map_unlock() to postpone handling the deferred * map entries, which cannot be done until both old_map and * new_map locks are released. */ sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); return (vm2); } int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_size_t growsize, init_ssize; rlim_t lmemlim, vmemlim; int rv; growsize = sgrowsiz; init_ssize = (max_ssize < growsize) ? max_ssize : growsize; vm_map_lock(map); lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); vmemlim = lim_cur(curthread, RLIMIT_VMEM); if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + init_ssize > lmemlim) { rv = KERN_NO_SPACE; goto out; } } /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { rv = KERN_NO_SPACE; goto out; } rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, max, cow); out: vm_map_unlock(map); return (rv); } static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, top; vm_size_t init_ssize; int orient, rv; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. * NOTE: We explicitly allow bi-directional stacks. */ orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP); KASSERT(orient != 0, ("No stack grow direction")); if (addrbos < vm_map_min(map) || addrbos > vm_map_max(map) || addrbos + max_ssize < addrbos) return (KERN_NO_SPACE); init_ssize = (max_ssize < growsize) ? max_ssize : growsize; /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) return (KERN_NO_SPACE); /* * If we can't accommodate max_ssize in the current mapping, no go. * However, we need to be aware that subsequent user mappings might * map into the space we have reserved for stack, and currently this * space is not protected. * * Hopefully we will at least detect this condition when we try to * grow the stack. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < addrbos + max_ssize)) return (KERN_NO_SPACE); /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) bot = addrbos + max_ssize - init_ssize; else if (orient == MAP_STACK_GROWS_UP) bot = addrbos; else bot = round_page(addrbos + max_ssize/2 - init_ssize/2); top = bot + init_ssize; rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); /* Now set the avail_ssize amount. */ if (rv == KERN_SUCCESS) { new_entry = prev_entry->next; if (new_entry->end != top || new_entry->start != bot) panic("Bad entry start/end for new stack entry"); new_entry->avail_ssize = max_ssize - init_ssize; KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, ("new entry lacks MAP_ENTRY_GROWS_DOWN")); KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, ("new entry lacks MAP_ENTRY_GROWS_UP")); } return (rv); } static int stack_guard_page = 0; SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, &stack_guard_page, 0, "Insert stack guard page ahead of the growable segments."); /* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the * desired address is already mapped, or if we successfully grow * the stack. Also returns KERN_SUCCESS if addr is outside the * stack range (this is strange, but preserves compatibility with * the grow function in vm_machdep.c). */ int vm_map_growstack(struct proc *p, vm_offset_t addr) { vm_map_entry_t next_entry, prev_entry; vm_map_entry_t new_entry, stack_entry; struct vmspace *vm = p->p_vmspace; vm_map_t map = &vm->vm_map; vm_offset_t end; vm_size_t growsize; size_t grow_amount, max_grow; rlim_t lmemlim, stacklim, vmemlim; int is_procstack, rv; struct ucred *cred; #ifdef notyet uint64_t limit; #endif #ifdef RACCT int error; #endif lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); stacklim = lim_cur(curthread, RLIMIT_STACK); vmemlim = lim_cur(curthread, RLIMIT_VMEM); Retry: vm_map_lock_read(map); /* If addr is already in the entry range, no need to grow.*/ if (vm_map_lookup_entry(map, addr, &prev_entry)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } next_entry = prev_entry->next; if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) { /* * This entry does not grow upwards. Since the address lies * beyond this entry, the next entry (if one exists) has to * be a downward growable entry. The entry list header is * never a growable entry, so it suffices to check the flags. */ if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } stack_entry = next_entry; } else { /* * This entry grows upward. If the next entry does not at * least grow downwards, this is the entry we need to grow. * otherwise we have two possible choices and we have to * select one. */ if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) { /* * We have two choices; grow the entry closest to * the address to minimize the amount of growth. */ if (addr - prev_entry->end <= next_entry->start - addr) stack_entry = prev_entry; else stack_entry = next_entry; } else stack_entry = prev_entry; } if (stack_entry == next_entry) { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo")); KASSERT(addr < stack_entry->start, ("foo")); end = (prev_entry != &map->header) ? prev_entry->end : stack_entry->start - stack_entry->avail_ssize; grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE); max_grow = stack_entry->start - end; } else { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo")); KASSERT(addr >= stack_entry->end, ("foo")); end = (next_entry != &map->header) ? next_entry->start : stack_entry->end + stack_entry->avail_ssize; grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE); max_grow = end - stack_entry->end; } if (grow_amount > stack_entry->avail_ssize) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* * If there is no longer enough space between the entries nogo, and * adjust the available space. Note: this should only happen if the * user has mapped into the stack area after the stack was created, * and is probably an error. * * This also effectively destroys any guard page the user might have * intended by limiting the stack size. */ if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) { if (vm_map_lock_upgrade(map)) goto Retry; stack_entry->avail_ssize = max_grow; vm_map_unlock(map); return (KERN_NO_SPACE); } is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr && addr < (vm_offset_t)p->p_sysent->sv_usrstack) ? 1 : 0; /* * If this is the main process stack, see if we're over the stack * limit. */ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (is_procstack && racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) { PROC_UNLOCK(p); vm_map_unlock_read(map); return (KERN_NO_SPACE); } PROC_UNLOCK(p); } #endif /* Round up the grow amount modulo sgrowsiz */ growsize = sgrowsiz; grow_amount = roundup(grow_amount, growsize); if (grow_amount > stack_entry->avail_ssize) grow_amount = stack_entry->avail_ssize; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = trunc_page((vm_size_t)stacklim) - ctob(vm->vm_ssize); } #ifdef notyet PROC_LOCK(p); limit = racct_get_available(p, RACCT_STACK); PROC_UNLOCK(p); if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) grow_amount = limit - ctob(vm->vm_ssize); #endif if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { PROC_UNLOCK(p); vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { PROC_UNLOCK(p); vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif if (vm_map_lock_upgrade(map)) goto Retry; if (stack_entry == next_entry) { /* * Growing downward. */ /* Get the preliminary new entry start value */ addr = stack_entry->start - grow_amount; /* * If this puts us into the previous entry, cut back our * growth to the available space. Also, see the note above. */ if (addr < end) { stack_entry->avail_ssize = max_grow; addr = end; if (stack_guard_page) addr += PAGE_SIZE; } rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, next_entry->protection, next_entry->max_protection, MAP_STACK_GROWS_DOWN); /* Adjust the available stack space by the amount we grew. */ if (rv == KERN_SUCCESS) { new_entry = prev_entry->next; KASSERT(new_entry == stack_entry->prev, ("foo")); KASSERT(new_entry->end == stack_entry->start, ("foo")); KASSERT(new_entry->start == addr, ("foo")); KASSERT((new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, ("new entry lacks MAP_ENTRY_GROWS_DOWN")); grow_amount = new_entry->end - new_entry->start; new_entry->avail_ssize = stack_entry->avail_ssize - grow_amount; stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN; } } else { /* * Growing upward. */ addr = stack_entry->end + grow_amount; /* * If this puts us into the next entry, cut back our growth * to the available space. Also, see the note above. */ if (addr > end) { stack_entry->avail_ssize = end - stack_entry->end; addr = end; if (stack_guard_page) addr -= PAGE_SIZE; } grow_amount = addr - stack_entry->end; cred = stack_entry->cred; if (cred == NULL && stack_entry->object.vm_object != NULL) cred = stack_entry->object.vm_object->cred; if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred)) rv = KERN_NO_SPACE; /* Grow the underlying object if applicable. */ else if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), (vm_size_t)grow_amount, cred != NULL)) { map->size += (addr - stack_entry->end); /* Update the current entry. */ stack_entry->end = addr; stack_entry->avail_ssize -= grow_amount; vm_map_entry_resize_free(map, stack_entry); rv = KERN_SUCCESS; } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); vm_map_unlock(map); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) { vm_map_wire(map, (stack_entry == next_entry) ? addr : addr - grow_amount, (stack_entry == next_entry) ? stack_entry->start : addr, (p->p_flag & P_SYSTEM) ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); } out: #ifdef RACCT if (racct_enable && rv != KERN_SUCCESS) { PROC_LOCK(p); error = racct_set(p, RACCT_VMEM, map->size); KASSERT(error == 0, ("decreasing RACCT_VMEM failed")); if (!old_mlock) { error = racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed")); } error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize)); KASSERT(error == 0, ("decreasing RACCT_STACK failed")); PROC_UNLOCK(p); } #endif return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ int vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0, ("vmspace_exec recursed")); newvmspace = vmspace_alloc(minuser, maxuser, NULL); if (newvmspace == NULL) return (ENOMEM); newvmspace->vm_swrss = oldvmspace->vm_swrss; /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); curthread->td_pflags |= TDP_EXECVMSPC; return (0); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ int vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; vm_ooffset_t fork_charge; if (oldvmspace->vm_refcnt == 1) return (0); fork_charge = 0; newvmspace = vmspace_fork(oldvmspace, &fork_charge); if (newvmspace == NULL) return (ENOMEM); if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) { vmspace_free(newvmspace); return (ENOMEM); } PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); vmspace_free(oldvmspace); return (0); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; vm_object_t eobject; vm_size_t size; struct ucred *cred; RetryLookup:; vm_map_lock_read(map); /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } entry = *out_entry; /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. */ prot = entry->protection; fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags & (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) != (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY), ("entry %p flags %x", entry, entry->eflags)); if ((fault_typea & VM_PROT_COPY) != 0 && (entry->max_protection & VM_PROT_WRITE) == 0 && (entry->eflags & MAP_ENTRY_COW) == 0) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; size = entry->end - entry->start; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if ((fault_type & VM_PROT_WRITE) != 0 || (fault_typea & VM_PROT_COPY) != 0) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; if (entry->cred == NULL) { /* * The debugger owner is charged for * the memory. */ cred = curthread->td_ucred; crhold(cred); if (!swap_reserve_by_cred(size, cred)) { crfree(cred); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } entry->cred = cred; } vm_object_shadow(&entry->object.vm_object, &entry->offset, size); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; eobject = entry->object.vm_object; if (eobject->cred != NULL) { /* * The object was not shadowed. */ swap_release_by_cred(size, entry->cred); crfree(entry->cred); entry->cred = NULL; } else if (entry->cred != NULL) { VM_OBJECT_WLOCK(eobject); eobject->cred = entry->cred; eobject->charge = size; VM_OBJECT_WUNLOCK(eobject); entry->cred = NULL; } vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(size)); entry->offset = 0; if (entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = size; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_locked: * * Lookup the faulting address. A version of vm_map_lookup that returns * KERN_FAILURE instead of blocking on map lock or memory allocation. */ int vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) return (KERN_INVALID_ADDRESS); entry = *out_entry; /* * Fail if the entry refers to a submap. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return (KERN_FAILURE); /* * Check whether this task is allowed to have this page. */ prot = entry->protection; fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * Fail if the entry was copy-on-write for a write fault. */ if (fault_type & VM_PROT_WRITE) return (KERN_FAILURE); /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } /* * Fail if an object should be created. */ if (entry->object.vm_object == NULL && !map->system_map) return (KERN_FAILURE); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } #include "opt_ddb.h" #ifdef DDB #include #include static void vm_map_print(vm_map_t map) { vm_map_entry_t entry; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { db_iprintf("map entry %p: start=%p, end=%p\n", (void *)entry, (void *)entry->start, (void *)entry->end); { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char)entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); if ((entry->prev == &map->header) || (entry->prev->object.sub_map != entry->object.sub_map)) { db_indent += 2; vm_map_print((vm_map_t)entry->object.sub_map); db_indent -= 2; } } else { if (entry->cred != NULL) db_printf(", ruid %d", entry->cred->cr_ruid); db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->object.vm_object && entry->object.vm_object->cred) db_printf(", obj ruid %d charge %jx", entry->object.vm_object->cred->cr_ruid, (uintmax_t)entry->object.vm_object->charge); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); if ((entry->prev == &map->header) || (entry->prev->object.vm_object != entry->object.vm_object)) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, 0, 0, (char *)0); db_indent -= 2; } } } db_indent -= 2; } DB_SHOW_COMMAND(map, map) { if (!have_addr) { db_printf("usage: show map \n"); return; } vm_map_print((vm_map_t)addr); } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = db_lookup_proc(addr); } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((vm_map_t)&p->p_vmspace->vm_map); } #endif /* DDB */ Index: stable/11/sys/vm/vm_meter.c =================================================================== --- stable/11/sys/vm/vm_meter.c (revision 318715) +++ stable/11/sys/vm/vm_meter.c (revision 318716) @@ -1,304 +1,314 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vmmeter vm_cnt; SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold"); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages"); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock"); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive"); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point"); static int sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 u_int32_t la[4]; if (req->flags & SCTL_MASK32) { la[0] = averunnable.ldavg[0]; la[1] = averunnable.ldavg[1]; la[2] = averunnable.ldavg[2]; la[3] = averunnable.fscale; return SYSCTL_OUT(req, la, sizeof(la)); } else #endif return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg", "Machine loadaverage history"); /* * This function aims to determine if the object is mapped, * specifically, if it is referenced by a vm_map_entry. Because * objects occasionally acquire transient references that do not * represent a mapping, the method used here is inexact. However, it * has very low overhead and is good enough for the advisory * vm.vmtotal sysctl. */ static bool is_object_active(vm_object_t obj) { return (obj->ref_count > obj->shadow_count); } static int vmtotal(SYSCTL_HANDLER_ARGS) { struct vmtotal total; vm_object_t object; struct proc *p; struct thread *td; bzero(&total, sizeof(total)); /* * Calculate process statistics. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if ((p->p_flag & P_SYSTEM) != 0) continue; PROC_LOCK(p); if (p->p_state != PRS_NEW) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_INHIBITED: if (TD_IS_SWAPPED(td)) total.t_sw++; else if (TD_IS_SLEEPING(td)) { if (td->td_priority <= PZERO) total.t_dw++; else total.t_sl++; if (td->td_wchan == &vm_cnt.v_free_count) total.t_pw++; } break; case TDS_CAN_RUN: total.t_sw++; break; case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; break; default: break; } thread_unlock(td); } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* * Calculate object memory usage statistics. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { /* * Perform unsynchronized reads on the object. In * this case, the lack of synchronization should not * impair the accuracy of the reported statistics. */ if ((object->flags & OBJ_FICTITIOUS) != 0) { /* * Devices, like /dev/mem, will badly skew our totals. */ continue; } if (object->ref_count == 0) { /* * Also skip unreferenced objects, including * vnodes representing mounted file systems. */ continue; } if (object->ref_count == 1 && (object->flags & OBJ_NOSPLIT) != 0) { /* * Also skip otherwise unreferenced swap * objects backing tmpfs vnodes, and POSIX or * SysV shared memory. */ continue; } total.t_vm += object->size; total.t_rm += object->resident_page_count; if (is_object_active(object)) { total.t_avm += object->size; total.t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ total.t_vmshr += object->size; total.t_rmshr += object->resident_page_count; if (is_object_active(object)) { total.t_avmshr += object->size; total.t_armshr += object->resident_page_count; } } } mtx_unlock(&vm_object_list_mtx); total.t_free = vm_cnt.v_free_count + vm_cnt.v_cache_count; return (sysctl_handle_opaque(oidp, &total, sizeof(total), req)); } /* - * vcnt() - accumulate statistics from all cpus and the global cnt - * structure. + * vm_meter_cnt() - accumulate statistics from all cpus and the global cnt + * structure. * * The vmmeter structure is now per-cpu as well as global. Those * statistics which can be kept on a per-cpu basis (to avoid cache * stalls between cpus) can be moved to the per-cpu vmmeter. Remaining * statistics, such as v_free_reserved, are left in the global * structure. - * - * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) */ -static int -vcnt(SYSCTL_HANDLER_ARGS) +u_int +vm_meter_cnt(size_t offset) { - int count = *(int *)arg1; - int offset = (char *)arg1 - (char *)&vm_cnt; + struct pcpu *pcpu; + u_int count; int i; + count = *(u_int *)((char *)&vm_cnt + offset); CPU_FOREACH(i) { - struct pcpu *pcpu = pcpu_find(i); - count += *(int *)((char *)&pcpu->pc_cnt + offset); + pcpu = pcpu_find(i); + count += *(u_int *)((char *)&pcpu->pc_cnt + offset); } - return (SYSCTL_OUT(req, &count, sizeof(int))); + return (count); } +static int +cnt_sysctl(SYSCTL_HANDLER_ARGS) +{ + u_int count; + + count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt); + return (SYSCTL_OUT(req, &count, sizeof(count))); +} + SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, "VM meter sys stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); #define VM_STATS(parent, var, descr) \ SYSCTL_PROC(parent, OID_AUTO, var, \ - CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, vcnt, \ - "IU", descr) + CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0, \ + cnt_sysctl, "IU", descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) VM_STATS_SYS(v_swtch, "Context switches"); VM_STATS_SYS(v_trap, "Traps"); VM_STATS_SYS(v_syscall, "System calls"); VM_STATS_SYS(v_intr, "Device interrupts"); VM_STATS_SYS(v_soft, "Software interrupts"); VM_STATS_VM(v_vm_faults, "Address memory faults"); VM_STATS_VM(v_io_faults, "Page faults requiring I/O"); VM_STATS_VM(v_cow_faults, "Copy-on-write faults"); VM_STATS_VM(v_cow_optim, "Optimized COW faults"); VM_STATS_VM(v_zfod, "Pages zero-filled on demand"); VM_STATS_VM(v_ozfod, "Optimized zero fill pages"); VM_STATS_VM(v_swapin, "Swap pager pageins"); VM_STATS_VM(v_swapout, "Swap pager pageouts"); VM_STATS_VM(v_swappgsin, "Swap pages swapped in"); VM_STATS_VM(v_swappgsout, "Swap pages swapped out"); VM_STATS_VM(v_vnodein, "Vnode pager pageins"); VM_STATS_VM(v_vnodeout, "Vnode pager pageouts"); VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); VM_STATS_VM(v_intrans, "In transit page faults"); -VM_STATS_VM(v_reactivated, "Pages reactivated from free list"); +VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon"); +VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); VM_STATS_VM(v_tcached, "Total pages cached"); VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); VM_STATS_VM(v_tfree, "Total pages freed"); VM_STATS_VM(v_page_size, "Page size in bytes"); VM_STATS_VM(v_page_count, "Total number of pages in system"); VM_STATS_VM(v_free_reserved, "Pages reserved for deadlock"); VM_STATS_VM(v_free_target, "Pages desired free"); VM_STATS_VM(v_free_min, "Minimum low-free-pages threshold"); VM_STATS_VM(v_free_count, "Free pages"); VM_STATS_VM(v_wire_count, "Wired pages"); VM_STATS_VM(v_active_count, "Active pages"); VM_STATS_VM(v_inactive_target, "Desired inactive pages"); VM_STATS_VM(v_inactive_count, "Inactive pages"); +VM_STATS_VM(v_laundry_count, "Pages eligible for laundering"); VM_STATS_VM(v_cache_count, "Pages on cache queue"); VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel"); VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code"); VM_STATS_VM(v_forks, "Number of fork() calls"); VM_STATS_VM(v_vforks, "Number of vfork() calls"); VM_STATS_VM(v_rforks, "Number of rfork() calls"); VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel"); VM_STATS_VM(v_forkpages, "VM pages affected by fork()"); VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()"); VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()"); VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel"); SYSCTL_INT(_vm_stats_misc, OID_AUTO, zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, "Number of zero-ed free pages"); Index: stable/11/sys/vm/vm_mmap.c =================================================================== --- stable/11/sys/vm/vm_mmap.c (revision 318715) +++ stable/11/sys/vm/vm_mmap.c (revision 318716) @@ -1,1569 +1,1566 @@ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif int old_mlock = 0; SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); #ifdef MAP_32BIT #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #endif #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif int sys_sbrk(struct thread *td, struct sbrk_args *uap) { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif int sys_sstk(struct thread *td, struct sstk_args *uap) { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif int ogetpagesize(struct thread *td, struct getpagesize_args *uap) { td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int sys_mmap(struct thread *td, struct mmap_args *uap) { return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, uap->flags, uap->fd, uap->pos)); } int kern_mmap(struct thread *td, uintptr_t addr0, size_t size, int prot, int flags, int fd, off_t pos) { struct vmspace *vms; struct file *fp; vm_offset_t addr; vm_size_t pageoff; vm_prot_t cap_maxprot; int align, error; cap_rights_t rights; vms = td->td_proc->p_vmspace; fp = NULL; AUDIT_ARG_FD(fd); addr = addr0; /* * Ignore old flags that used to be defined but did not do anything. */ flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); /* * Enforce the constraints. * Mapping of length 0 is only allowed for old binaries. * Anonymous mapping shall specify -1 as filedescriptor and * zero position for new code. Be nice to ancient a.out * binaries and correct pos for anonymous mapping, since old * ld.so sometimes issues anonymous map requests with non-zero * pos. */ if (!SV_CURPROC_FLAG(SV_AOUT)) { if ((size == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) return (EINVAL); } else { if ((flags & MAP_ANON) != 0) pos = 0; } if (flags & MAP_STACK) { if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | MAP_PREFAULT_READ | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) return (EINVAL); if (prot != PROT_NONE && (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) return (EINVAL); /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* Ensure alignment is at least a page and fits in a pointer. */ align = flags & MAP_ALIGNMENT_MASK; if (align != 0 && align != MAP_ALIGNED_SUPER && (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (addr < vm_map_min(&vms->vm_map) || addr + size > vm_map_max(&vms->vm_map)) return (EINVAL); if (addr + size < addr) return (EINVAL); #ifdef MAP_32BIT if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) return (EINVAL); } else if (flags & MAP_32BIT) { /* * For MAP_32BIT, override the hint if it is too high and * do not bother moving the mapping past the heap (since * the heap is usually above 2GB). */ if (addr + size > MAP_32BIT_MAX_ADDR) addr = 0; #endif } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)); } if (size == 0) { /* * Return success without mapping anything for old * binaries that request a page-aligned mapping of * length 0. For modern binaries, this function * returns an error earlier. */ error = 0; } else if (flags & MAP_ANON) { /* * Mapping blank space is trivial. * * This relies on VM_PROT_* matching PROT_*. */ error = vm_mmap_object(&vms->vm_map, &addr, size, prot, VM_PROT_ALL, flags, NULL, pos, FALSE, td); } else { /* * Mapping file, get fp for validation and don't let the * descriptor disappear on us if we block. Check capability * rights, but also return the maximum rights to be combined * with maxprot later. */ cap_rights_init(&rights, CAP_MMAP); if (prot & PROT_READ) cap_rights_set(&rights, CAP_MMAP_R); if ((flags & MAP_SHARED) != 0) { if (prot & PROT_WRITE) cap_rights_set(&rights, CAP_MMAP_W); } if (prot & PROT_EXEC) cap_rights_set(&rights, CAP_MMAP_X); error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); if (error != 0) goto done; if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) { error = EINVAL; goto done; } /* This relies on VM_PROT_* matching PROT_*. */ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, cap_maxprot, flags, pos, td); } if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: if (fp) fdrop(fp, td); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) { return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, uap->flags, uap->fd, uap->pos)); } #endif #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(struct thread *td, struct ommap_args *uap) { static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; int flags, prot; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 prot = cvtbsdprot[uap->prot & 0x7]; #ifdef COMPAT_FREEBSD32 #if defined(__amd64__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && prot != 0) prot |= PROT_EXEC; #endif #endif flags = 0; if (uap->flags & OMAP_ANON) flags |= MAP_ANON; if (uap->flags & OMAP_COPY) flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) flags |= MAP_SHARED; else flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) flags |= MAP_FIXED; return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags, uap->fd, uap->pos)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; size_t len; int flags; }; #endif int sys_msync(struct thread *td, struct msync_args *uap) { return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags)); } int kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) { vm_offset_t addr; vm_size_t pageoff; vm_map_t map; int rv; addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. */ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (ENOMEM); case KERN_INVALID_ARGUMENT: return (EBUSY); case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif int sys_munmap(struct thread *td, struct munmap_args *uap) { return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); } int kern_munmap(struct thread *td, uintptr_t addr0, size_t size) { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; bool pmc_handled; #endif vm_offset_t addr; vm_size_t pageoff; vm_map_t map; if (size == 0) return (EINVAL); addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... */ map = &td->td_proc->p_vmspace->vm_map; if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) return (EINVAL); vm_map_lock(map); #ifdef HWPMC_HOOKS pmc_handled = false; if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { pmc_handled = true; /* * Inform hwpmc if the address range being unmapped contains * an executable region. */ pkm.pm_address = (uintptr_t) NULL; if (vm_map_lookup_entry(map, addr, &entry)) { for (; entry != &map->header && entry->start < addr + size; entry = entry->next) { if (vm_map_check_protection(map, entry->start, entry->end, VM_PROT_EXECUTE) == TRUE) { pkm.pm_address = (uintptr_t) addr; pkm.pm_size = (size_t) size; break; } } } } #endif vm_map_delete(map, addr, addr + size); #ifdef HWPMC_HOOKS if (__predict_false(pmc_handled)) { /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); vm_map_unlock_read(map); } else #endif vm_map_unlock(map); /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ return (0); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif int sys_mprotect(struct thread *td, struct mprotect_args *uap) { return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot)); } int kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot) { vm_offset_t addr; vm_size_t pageoff; addr = addr0; prot = (prot & VM_PROT_ALL); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); case KERN_RESOURCE_SHORTAGE: return (ENOMEM); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif int sys_minherit(struct thread *td, struct minherit_args *uap) { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif int sys_madvise(struct thread *td, struct madvise_args *uap) { return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); } int kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) { vm_map_t map; vm_offset_t addr, end, start; int flags; /* * Check for our special case, advising the swap pager we are * "immortal." */ if (behav == MADV_PROTECT) { flags = PPROT_SET; return (kern_procctl(td, P_PID, td->td_proc->p_pid, PROC_SPROTECT, &flags)); } /* * Check for illegal behavior */ if (behav < 0 || behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; addr = addr0; if (addr < vm_map_min(map) || addr + len > vm_map_max(map)) return (EINVAL); if ((addr + len) < addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page(addr); end = round_page(addr + len); if (vm_map_madvise(map, start, end, behav)) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif int sys_mincore(struct thread *td, struct mincore_args *uap) { return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); } int kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; vm_object_t object; vm_paddr_t locked_pa; vm_page_t m; vm_pindex_t pindex; int mincoreinfo; unsigned int timestamp; boolean_t locked; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page(addr0); end = addr + (vm_size_t)round_page(len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) { vm_map_unlock_read(map); return (ENOMEM); } /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * check for contiguity */ if (current->end < end && (entry->next == &map->header || current->next->start > current->end)) { vm_map_unlock_read(map); return (ENOMEM); } /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ object = NULL; locked_pa = 0; retry: m = NULL; mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); if (locked_pa != 0) { /* * The page is mapped by this process but not * both accessed and modified. It is also * managed. Acquire the object lock so that * other mappings might be examined. */ m = PHYS_TO_VM_PAGE(locked_pa); if (m->object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = m->object; locked = VM_OBJECT_TRYWLOCK(object); vm_page_unlock(m); if (!locked) { VM_OBJECT_WLOCK(object); vm_page_lock(m); goto retry; } } else vm_page_unlock(m); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("mincore: page %p is mapped but invalid", m)); } else if (mincoreinfo == 0) { /* * The page is not mapped by this process. If * the object implements managed pages, then * determine if the page is resident so that * the mappings might be examined. */ if (current->object.vm_object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = current->object.vm_object; VM_OBJECT_WLOCK(object); } if (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP || object->type == OBJT_VNODE) { pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); - if (m == NULL && - vm_page_is_cached(object, pindex)) - mincoreinfo = MINCORE_INCORE; if (m != NULL && m->valid == 0) m = NULL; if (m != NULL) mincoreinfo = MINCORE_INCORE; } } if (m != NULL) { /* Examine other mappings to the page. */ if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty != 0) mincoreinfo |= MINCORE_MODIFIED_OTHER; /* * The first test for PGA_REFERENCED is an * optimization. The second test is * required because a concurrent pmap * operation could clear the last reference * and set PGA_REFERENCED before the call to * pmap_is_referenced(). */ if ((m->aflags & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || (m->aflags & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) VM_OBJECT_WUNLOCK(object); /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = atop(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = atop(end - first_addr); while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif int sys_mlock(struct thread *td, struct mlock_args *uap) { return (kern_mlock(td->td_proc, td->td_ucred, __DECONST(uintptr_t, uap->addr), uap->len)); } int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) { vm_offset_t addr, end, last, start; vm_size_t npages, size; vm_map_t map; unsigned long nsize; int error; error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); if (error) return (error); addr = addr0; size = len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); map = &proc->p_vmspace->vm_map; PROC_LOCK(proc); nsize = ptoa(npages + pmap_wired_count(map->pmap)); if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); if (npages + vm_cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef RACCT if (racct_enable) { PROC_LOCK(proc); error = racct_set(proc, RACCT_MEMLOCK, nsize); PROC_UNLOCK(proc); if (error != 0) return (ENOMEM); } #endif error = vm_map_wire(map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(proc); racct_set(proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif int sys_mlockall(struct thread *td, struct mlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); /* * If wiring all pages in the process would cause it to exceed * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { PROC_LOCK(td->td_proc); if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); PROC_UNLOCK(td->td_proc); if (error != 0) return (ENOMEM); } #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); error = (error == KERN_SUCCESS ? 0 : EAGAIN); } #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif int sys_munlockall(struct thread *td, struct munlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. */ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, 0); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif int sys_munlock(struct thread *td, struct munlock_args *uap) { return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); } int kern_munlock(struct thread *td, uintptr_t addr0, size_t size) { vm_offset_t addr, end, last, start; #ifdef RACCT vm_map_t map; #endif int error; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); addr = addr0; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); map = &td->td_proc->p_vmspace->vm_map; racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, boolean_t *writecounted) { struct vattr va; vm_object_t obj; vm_offset_t foff; struct ucred *cred; int error, flags, locktype; cred = td->td_ucred; if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) locktype = LK_EXCLUSIVE; else locktype = LK_SHARED; if ((error = vget(vp, locktype, td)) != 0) return (error); AUDIT_ARG_VNODE1(vp); foff = *foffp; flags = *flagsp; obj = vp->v_object; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (obj == NULL) { error = EINVAL; goto done; } if (obj->type == OBJT_VNODE && obj->handle != vp) { vput(vp); vp = (struct vnode *)obj->handle; /* * Bypass filesystems obey the mpsafety of the * underlying fs. Tmpfs never bypasses. */ error = vget(vp, locktype, td); if (error != 0) return (error); } if (locktype == LK_EXCLUSIVE) { *writecounted = TRUE; vnode_pager_update_writecount(obj, 0, objsize); } } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, cred))) goto done; #ifdef MAC /* This relies on VM_PROT_* matching PROT_*. */ error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { if (prot & VM_PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. */ objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; if (obj->type == OBJT_VNODE) { obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred); if (obj == NULL) { error = ENOMEM; goto done; } } else { KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, ("wrong object type")); VM_OBJECT_WLOCK(obj); vm_object_reference_locked(obj); #if VM_NRESERVLEVEL > 0 vm_object_color(obj, 0); #endif VM_OBJECT_WUNLOCK(obj); } *objp = obj; *flagsp = flags; vfs_mark_atime(vp, cred); done: if (error != 0 && *writecounted) { *writecounted = FALSE; vnode_pager_update_writecount(obj, objsize, 0); } vput(vp); return (error); } /* * vm_mmap_cdev() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on cdevs. */ int vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, vm_ooffset_t *foff, vm_object_t *objp) { vm_object_t obj; int error, flags; flags = *flagsp; if (dsw->d_flags & D_MMAP_ANON) { *objp = NULL; *foff = 0; *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); } /* * cdevs do not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if (flags & (MAP_PRIVATE|MAP_COPY)) return (EINVAL); /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); if (error != 0) return (error); #endif /* * First, try d_mmap_single(). If that is not implemented * (returns ENODEV), fall back to using the device pager. * Note that d_mmap_single() must return a reference to the * object (it needs to bump the reference count of the object * it returns somehow). * * XXX assumes VM_PROT_* == PROT_* */ error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); if (error != ENODEV) return (error); obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, td->td_ucred); if (obj == NULL) return (EINVAL); *objp = obj; *flagsp = flags; return (0); } /* * vm_mmap() * * Internal version of mmap used by exec, sys5 shared memory, and * various device drivers. Handle is either a vnode pointer, a * character device, or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, objtype_t handle_type, void *handle, vm_ooffset_t foff) { vm_object_t object; struct thread *td = curthread; int error; boolean_t writecounted; if (size == 0) return (EINVAL); size = round_page(size); object = NULL; writecounted = FALSE; /* * Lookup/allocate object. */ switch (handle_type) { case OBJT_DEVICE: { struct cdevsw *dsw; struct cdev *cdev; int ref; cdev = handle; dsw = dev_refthread(cdev, &ref); if (dsw == NULL) return (ENXIO); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, dsw, &foff, &object); dev_relthread(cdev, ref); break; } case OBJT_VNODE: error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, &foff, &object, &writecounted); break; case OBJT_DEFAULT: if (handle == NULL) { error = 0; break; } /* FALLTHROUGH */ default: error = EINVAL; break; } if (error) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0 && object != NULL) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vnode_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } return (error); } /* * Internal version of mmap that maps a specific VM object into an * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. */ int vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, boolean_t writecounted, struct thread *td) { boolean_t fitit; int docow, error, findspace, rv; if (map == &td->td_proc->p_vmspace->vm_map) { PROC_LOCK(td->td_proc); if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); PROC_UNLOCK(td->td_proc); return (ENOMEM); } error = racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + size); if (error != 0) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); PROC_UNLOCK(td->td_proc); return (error); } } PROC_UNLOCK(td->td_proc); } /* * We currently can only deal with page aligned file offsets. * The mmap() system call already enforces this by subtracting * the page offset from the file offset, but checking here * catches errors in device drivers (e.g. d_single_mmap() * callbacks) and other internal mapping requests (such as in * exec). */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; } if (flags & MAP_ANON) { if (object != NULL || foff != 0) return (EINVAL); docow = 0; } else if (flags & MAP_PREFAULT_READ) docow = MAP_PREFAULT; else docow = MAP_PREFAULT_PARTIAL; if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; /* Shared memory is also shared with children. */ if (flags & MAP_SHARED) docow |= MAP_INHERIT_SHARE; if (writecounted) docow |= MAP_VN_WRITECOUNT; if (flags & MAP_STACK) { if (object != NULL) return (EINVAL); docow |= MAP_STACK_GROWS_DOWN; } if ((flags & MAP_EXCL) != 0) docow |= MAP_CHECK_EXCL; if (fitit) { if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) findspace = VMFS_SUPER_SPACE; else if ((flags & MAP_ALIGNMENT_MASK) != 0) findspace = VMFS_ALIGNED_SPACE(flags >> MAP_ALIGNMENT_SHIFT); else findspace = VMFS_OPTIMAL_SPACE; rv = vm_map_find(map, object, foff, addr, size, #ifdef MAP_32BIT flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR : #endif 0, findspace, prot, maxprot, docow); } else { rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot, docow); } if (rv == KERN_SUCCESS) { /* * If the process has requested that all future mappings * be wired, then heed this. */ if (map->flags & MAP_WIREFUTURE) { vm_map_wire(map, *addr, *addr + size, VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); } } return (vm_mmap_to_errno(rv)); } /* * Translate a Mach VM return code to zero on success or the appropriate errno * on failure. */ int vm_mmap_to_errno(int rv) { switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: stable/11/sys/vm/vm_object.c =================================================================== --- stable/11/sys/vm/vm_object.c (revision 318715) +++ stable/11/sys/vm/vm_object.c (revision 318716) @@ -1,2643 +1,2611 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif - KASSERT(vm_object_cache_is_empty(object), - ("object %p has cached pages", - object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; object->rtree.rt_root = 0; - object->rtree.rt_flags = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; - object->cache.rt_root = 0; - object->cache.rt_flags = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif rw_init(&kmem_object->lock, "kmem vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_init(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vn_printf(vp, "vm_object_vndeallocate "); panic("vm_object_vndeallocate: bad object reference count"); } #endif if (!umtx_shm_vnobj_persistent && object->ref_count == 1) umtx_shm_object_terminated(object); /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from * object->handle. */ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } else { vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vdrop(vp); VM_OBJECT_WLOCK(object); object->ref_count--; if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { if (object->ref_count == 0) VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } } } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; struct vnode *vp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD || object->ref_count != 1) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vdrop(vp); return; } if ((object->flags & OBJ_TMPFS) != 0) VOP_UNSET_TEXT(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_WUNLOCK(robject); object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PDROP | PVM, "objde2", 0); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); vinvalbuf(vp, V_SAVE, 0, 0); BO_LOCK(&vp->v_bufobj); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); VM_OBJECT_WLOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); vm_page_lock(p); /* * Optimize the page's removal from the object by resetting * its "object" field. Specifically, if the page is not * wired, then the effect of this assignment is that * vm_page_free()'s call to vm_page_remove() will return * immediately without modifying the page or the object. */ p->object = NULL; if (p->wire_count == 0) { vm_page_free(p); PCPU_INC(cnt.v_pfree); } vm_page_unlock(p); } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif - if (__predict_false(!vm_object_cache_is_empty(object))) - vm_page_cache_free(object, 0, 0); KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advise) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; VM_OBJECT_WLOCK(object); /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } else if ((tobject->flags & OBJ_UNMANAGED) != 0) goto unlock_tobject; m = vm_page_lookup(tobject, tpindex); - if (m == NULL && advise == MADV_WILLNEED) { - /* - * If the page is cached, reactivate it. - */ - m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | - VM_ALLOC_NOBUSY); - } if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; goto shadowlookup; } else if (m->valid != VM_PAGE_BITS_ALL) goto unlock_tobject; /* * If the page is not in a normal state, skip it. */ vm_page_lock(m); if (m->hold_count != 0 || m->wire_count != 0) { vm_page_unlock(m); goto unlock_tobject; } KASSERT((m->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", m)); if (vm_page_busied(m)) { if (advise == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); } if (object != tobject) VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(tobject); vm_page_busy_sleep(m, "madvpo", false); VM_OBJECT_WLOCK(object); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else { vm_page_advise(m, advise); } vm_page_unlock(m); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_lock(m); VM_OBJECT_WUNLOCK(orig_object); vm_page_busy_sleep(m, "spltwt", false); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } - /* vm_page_rename() will handle dirty and cache. */ + /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); VM_WAIT; VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); - - /* - * Transfer any cached pages from orig_object to new_object. - * If swap_pager_copy() found swapped out pages within the - * specified range of orig_object, then it changed - * new_object's type to OBJT_SWAP when it transferred those - * pages to new_object. Otherwise, new_object's type - * should still be OBJT_DEFAULT and orig_object should not - * contain any cached pages within the specified range. - */ - if (__predict_false(!vm_object_cache_is_empty(orig_object))) - vm_page_cache_transfer(orig_object, offidxstart, - new_object); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); if (p != NULL) vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); if (p == NULL) VM_WAIT; else vm_page_busy_sleep(p, "vmocol", false); VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; + /* + * Initial conditions: + * + * We do not want to have to test for the existence of swap + * pages in the backing object. XXX but with the new swapper this + * would be pretty easy to do. + */ if (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); /* * Page is out of the parent object's range, we can * simply destroy it. */ vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped - * through the rename. vm_page_rename() will handle dirty and - * cache. + * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if (object->paging_in_progress != 0 || backing_object->paging_in_progress != 0) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { vm_object_pip_add(object, 1); vm_object_pip_add(backing_object, 1); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); - - /* - * Free any cached pages from backing_object. - */ - if (__predict_false( - !vm_object_cache_is_empty(backing_object))) - vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); vm_object_pip_wakeup(object); object_collapses++; } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) - goto skipmemq; + return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ vm_page_lock(p); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax", true); VM_OBJECT_WLOCK(object); goto again; } if (p->wire_count != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } goto next; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopar", false); VM_OBJECT_WLOCK(object); goto again; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_write(p); if (p->dirty) goto next; } if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); vm_page_free(p); next: vm_page_unlock(p); } vm_object_pip_wakeup(object); -skipmemq: - if (__predict_false(!vm_object_cache_is_empty(object))) - vm_page_cache_free(object, start, end); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(p); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); break; } } /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || (prev_object->flags & OBJ_TMPFS_NODE) != 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (depth = 0; depth < locked_depth; depth++) { tobject = object->backing_object; VM_OBJECT_RUNLOCK(object); object = tobject; } } struct vnode * vm_object_vnode(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) return (object->handle); if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) return (object->un_pager.swp.swp_tmpfs); return (NULL); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo.kvo_size = ptoa(obj->size); kvo.kvo_resident = obj->resident_page_count; kvo.kvo_ref_count = obj->ref_count; kvo.kvo_shadow_count = obj->shadow_count; kvo.kvo_memattr = obj->memattr; kvo.kvo_active = 0; kvo.kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ - if (m->queue == PQ_ACTIVE) + if (vm_page_active(m)) kvo.kvo_active++; - else if (m->queue == PQ_INACTIVE) + else if (vm_page_inactive(m)) kvo.kvo_inactive++; } kvo.kvo_vn_fileid = 0; kvo.kvo_vn_fsid = 0; freepath = NULL; fullpath = ""; vp = NULL; switch (obj->type) { case OBJT_DEFAULT: kvo.kvo_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kvo.kvo_type = KVME_TYPE_VNODE; vp = obj->handle; vref(vp); break; case OBJT_SWAP: kvo.kvo_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kvo.kvo_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kvo.kvo_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kvo.kvo_type = KVME_TYPE_DEAD; break; case OBJT_SG: kvo.kvo_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kvo.kvo_type = KVME_TYPE_MGTDEVICE; break; default: kvo.kvo_type = KVME_TYPE_UNKNOWN; break; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo.kvo_vn_fileid = va.va_fileid; kvo.kvo_vn_fsid = va.va_fsid; } vput(vp); } strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo.kvo_path) + 1; kvo.kvo_structsize = roundup(kvo.kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: stable/11/sys/vm/vm_object.h =================================================================== --- stable/11/sys/vm/vm_object.h (revision 318715) +++ stable/11/sys/vm/vm_object.h (revision 318716) @@ -1,350 +1,331 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include #include #include /* * Types defined: * * vm_object_t Virtual memory object. * - * The root of cached pages pool is protected by both the per-object lock - * and the free pages queue mutex. - * On insert in the cache radix trie, the per-object lock is expected - * to be already held and the free pages queue mutex will be - * acquired during the operation too. - * On remove and lookup from the cache radix trie, only the free - * pages queue mutex is expected to be locked. - * These rules allow for reliably checking for the presence of cached - * pages with only the per-object lock held, thereby reducing contention - * for the free pages queue mutex. - * * List of locks * (c) const until freed * (o) per-object lock * (f) free pages queue mutex * */ struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ vm_memattr_t memattr; /* default memory attribute for pages */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ - struct vm_radix cache; /* (o + f) root of the cache page radix trie */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; vm_ooffset_t writemappings; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; struct cdev_pager_ops *ops; struct cdev *dev; } devp; /* * SG pager * * sgp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) sgp_pglist; } sgp; /* * Swap pager * * swp_tmpfs - back-pointer to the tmpfs vnode, * if any, which uses the vm object * as backing store. The handle * cannot be reused for linking, * because the vnode can be * reclaimed and recreated, making * the handle changed and hash-chain * invalid. * * swp_bcount - number of swap 'swblock' metablocks, each * contains up to 16 swapblk assignments. * see vm/swap_pager.h */ struct { void *swp_tmpfs; int swp_bcount; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; void *umtx_data; }; /* * Flags */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ #define OBJ_POPULATE 0x0004 /* pager implements populate() */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ #define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ /* * Helpers to perform conversion between vm_object page indexes and offsets. * IDX_TO_OFF() converts an index into an offset. * OFF_TO_IDX() converts an offset into an index. Since offsets are signed * by default, the sign propagation in OFF_TO_IDX(), when applied to * negative offsets, is intentional and returns a vm_object page index * that cannot be created by a userspace mapping. * UOFF_TO_IDX() treats the offset as an unsigned value and converts it * into an index accordingly. Use it only when the full range of offset * values are allowed. Currently, this only applies to device mappings. * OBJ_MAX_SIZE specifies the maximum page index corresponding to the * maximum unsigned offset. */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #define UOFF_TO_IDX(off) (((vm_pindex_t)(off)) >> PAGE_SHIFT) #define OBJ_MAX_SIZE (UOFF_TO_IDX(UINT64_MAX) + 1) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). */ #define OBJPR_CLEANONLY 0x1 /* Don't remove dirty pages. */ #define OBJPR_NOTMAPPED 0x2 /* Don't unmap pages. */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; extern struct vm_object kmem_object_store; #define kernel_object (&kernel_object_store) #define kmem_object (&kmem_object_store) #define VM_OBJECT_ASSERT_LOCKED(object) \ rw_assert(&(object)->lock, RA_LOCKED) #define VM_OBJECT_ASSERT_RLOCKED(object) \ rw_assert(&(object)->lock, RA_RLOCKED) #define VM_OBJECT_ASSERT_WLOCKED(object) \ rw_assert(&(object)->lock, RA_WLOCKED) #define VM_OBJECT_ASSERT_UNLOCKED(object) \ rw_assert(&(object)->lock, RA_UNLOCKED) #define VM_OBJECT_LOCK_DOWNGRADE(object) \ rw_downgrade(&(object)->lock) #define VM_OBJECT_RLOCK(object) \ rw_rlock(&(object)->lock) #define VM_OBJECT_RUNLOCK(object) \ rw_runlock(&(object)->lock) #define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo)) #define VM_OBJECT_TRYRLOCK(object) \ rw_try_rlock(&(object)->lock) #define VM_OBJECT_TRYWLOCK(object) \ rw_try_wlock(&(object)->lock) #define VM_OBJECT_TRYUPGRADE(object) \ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) #define VM_OBJECT_WOWNED(object) \ rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) /* * The object must be locked or thread private. */ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } /* * Conditionally set the object's color, which (1) enables the allocation * of physical memory reservations for anonymous objects and larger-than- * superpage-sized named objects and (2) determines the first page offset * within the object at which a reservation may be allocated. In other * words, the color determines the alignment of the object with respect * to the largest superpage boundary. When mapping named objects, like * files or POSIX shared memory objects, the color should be set to zero * before a virtual address is selected for the mapping. In contrast, * for anonymous objects, the color may be set after the virtual address * is selected. * * The object must be locked. */ static __inline void vm_object_color(vm_object_t object, u_short color) { if ((object->flags & OBJ_COLORED) == 0) { object->pg_color = color; object->flags |= OBJ_COLORED; } } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); - -static __inline boolean_t -vm_object_cache_is_empty(vm_object_t object) -{ - - return (vm_radix_is_empty(&object->cache)); -} void umtx_shm_object_init(vm_object_t object); void umtx_shm_object_terminated(vm_object_t object); extern int umtx_shm_vnobj_persistent; vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_destroy (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_object_print(long addr, boolean_t have_addr, long count, char *modif); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue); struct vnode *vm_object_vnode(vm_object_t object); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ Index: stable/11/sys/vm/vm_page.c =================================================================== --- stable/11/sys/vm/vm_page.c (revision 318715) +++ stable/11/sys/vm/vm_page.c (revision 318716) @@ -1,4009 +1,3607 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. * * * The page daemon can acquire and hold any pair of page queue * locks in any order. * * - The object lock is required when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). * */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; vm_page_t vm_page_array; long vm_page_array_size; long first_page; int vm_page_zero_count; static int boot_pages = UMA_BOOT_PAGES; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); /* Is the page daemon waiting for free pages? */ static int vm_pageout_pages_needed; static uma_zone_t fakepg_zone; -static struct vnode *vm_page_alloc_init(vm_page_t m); -static void vm_page_cache_turn_free(vm_page_t m); +static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_free_wakeup(void); static void vm_page_init_fakepg(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL); static void vm_page_init_fakepg(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); } /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; vm_page_t m; char *next; int ret; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) continue; mtx_lock(&vm_page_queue_free_mtx); ret = vm_phys_unfree_page(m); mtx_unlock(&vm_page_queue_free_mtx); if (ret == TRUE) { TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (bootverbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; int i; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = + "vm laundry pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) = + &vm_cnt.v_laundry_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } } /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_paddr_t high_avail, low_avail, page_range, size; vm_paddr_t new_end; int i; vm_paddr_t pa; vm_paddr_t last_pa; char *list, *listend; vm_paddr_t end; vm_paddr_t biggestsize; int biggestone; int pages_per_zone; biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } } end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(&vm_dom[i]); /* * Almost all of the pages needed for bootstrapping UMA are used * for zone structures, so if the number of CPUs results in those * structures taking more than one page each, we set aside more pages * in proportion to the zone structure size. */ pages_per_zone = howmany(sizeof(struct uma_zone) + sizeof(struct uma_cache) * (mp_maxid + 1), UMA_SLAB_SIZE); if (pages_per_zone > 1) { /* Reserve more pages so that we don't run out. */ boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone; } /* * Allocate memory for use when boot strapping the kernel memory * allocator. * * CTFLAG_RDTUN doesn't work during the early boot process, so we must * manually fetch the value. */ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ defined(__i386__) || defined(__mips__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include the UMA bootstrap pages and vm_page_dump in a crash dump. * When pmap_map() uses the direct map, they are not automatically * included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); #elif defined(VM_PHYSSEG_DENSE) /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end == high_avail) page_range = (high_avail - low_avail) / (PAGE_SIZE + sizeof(struct vm_page)); else page_range = high_avail / PAGE_SIZE - first_page; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ if (high_avail == end) high_avail = new_end; new_end = vm_reserv_startup(&vaddr, new_end, high_avail); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); for (i = 0; i < page_range; i++) vm_page_array[i].order = VM_NFREEORDER; vm_page_array_size = page_range; /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Add every available physical page that is not blacklisted to * the free lists. */ vm_cnt.v_page_count = 0; vm_cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa) { vm_phys_add_page(pa); pa += PAGE_SIZE; } } TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; bool locked; vm_page_assert_xbusied(m); locked = mtx_owned(vm_page_lockptr(m)); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; if (x != 0 && !locked) vm_page_lock(m); if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) break; if (x != 0 && !locked) vm_page_unlock(m); } if (x != 0) { wakeup(m); if (!locked) vm_page_unlock(m); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); for (;;) { x = m->busy_lock; if (VPB_SHARERS(x) > 1) { if (atomic_cmpset_int(&m->busy_lock, x, x - VPB_ONE_SHARER)) break; continue; } if ((x & VPB_BIT_WAITERS) == 0) { KASSERT(x == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (atomic_cmpset_int(&m->busy_lock, VPB_SHARERS_WORD(1), VPB_UNBUSIED)) break; continue; } KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), ("vm_page_sunbusy: invalid lock state for waiters")); vm_page_lock(m); if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { vm_page_unlock(m); continue; } wakeup(m); vm_page_unlock(m); break; } } /* * vm_page_busy_sleep: * * Sleep and release the page lock, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * The given page must be locked. * * If nonshared is true, sleep only if the page is xbusy. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { u_int x; vm_page_assert_locked(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { vm_page_unlock(m); return; } msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; for (;;) { x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) return (1); } } static void vm_page_xunbusy_locked(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_assert_locked(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* There is a waiter, do wakeup() instead of vm_page_flash(). */ wakeup(m); } void vm_page_xunbusy_maybelocked(vm_page_t m) { bool lockacq; vm_page_assert_xbusied(m); /* * Fast path for unbusy. If it succeeds, we know that there * are no waiters, so we do not need a wakeup. */ if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) return; lockacq = !mtx_owned(vm_page_lockptr(m)); if (lockacq) vm_page_lock(m); vm_page_xunbusy_locked(m); if (lockacq) vm_page_unlock(m); } /* * vm_page_xunbusy_hard: * * Called after the first try the exclusive unbusy of a page failed. * It is assumed that the waiters bit is on. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_lock(m); vm_page_xunbusy_locked(m); vm_page_unlock(m); } /* * vm_page_flash: * * Wakeup anyone waiting for the page. * The ownership bits do not change. * * The given page must be locked. */ void vm_page_flash(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_OWNED); for (;;) { x = m->busy_lock; if ((x & VPB_BIT_WAITERS) == 0) return; if (atomic_cmpset_int(&m->busy_lock, x, x & (~VPB_BIT_WAITERS))) break; } wakeup(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); --mem->hold_count; if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) vm_page_free_toq(mem); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { struct mtx *mtx, *new_mtx; mtx = NULL; for (; count != 0; count--) { /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(*ma); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_unhold(*ma); ma++; } if (mtx != NULL) mtx_unlock(mtx); } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; m->wire_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { - /* These assertions refer to this operation by its public name. */ - KASSERT((m->flags & PG_CACHED) == 0, - ("vm_page_dirty: page in cache!")); + /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * - * Removes the given mem entry from the object/offset-page - * table and the object page list, but do not invalidate/terminate - * the backing store. + * Removes the specified page from its containing object, but does not + * invalidate any backing storage. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_remove(vm_page_t m) { vm_object_t object; + vm_page_t mrem; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((object = m->object) == NULL) return; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) vm_page_xunbusy_maybelocked(m); + mrem = vm_radix_remove(&object->rtree, m->pindex); + KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ - vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL, ("vm_page_replace: page already in object")); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: mold is on a paging queue")); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy_maybelocked(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating - * swap. If the page is on the cache, we have to deactivate it - * or vm_page_dirty() will panic. Dirty pages are not allowed - * on the cache. + * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* - * Convert all of the given object's cached pages that have a - * pindex within the given range into free pages. If the value - * zero is given for "end", then the range's upper bound is - * infinity. If the given object is backed by a vnode and it - * transitions from having one or more cached pages to none, the - * vnode's hold count is reduced. - */ -void -vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) -{ - vm_page_t m; - boolean_t empty; - - mtx_lock(&vm_page_queue_free_mtx); - if (__predict_false(vm_radix_is_empty(&object->cache))) { - mtx_unlock(&vm_page_queue_free_mtx); - return; - } - while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) { - if (end != 0 && m->pindex >= end) - break; - vm_radix_remove(&object->cache, m->pindex); - vm_page_cache_turn_free(m); - } - empty = vm_radix_is_empty(&object->cache); - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && empty) - vdrop(object->handle); -} - -/* - * Returns the cached page that is associated with the given - * object and offset. If, however, none exists, returns NULL. - * - * The free page queue must be locked. - */ -static inline vm_page_t -vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - return (vm_radix_lookup(&object->cache, pindex)); -} - -/* - * Remove the given cached page from its containing object's - * collection of cached pages. - * - * The free page queue must be locked. - */ -static void -vm_page_cache_remove(vm_page_t m) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - KASSERT((m->flags & PG_CACHED) != 0, - ("vm_page_cache_remove: page %p is not cached", m)); - vm_radix_remove(&m->object->cache, m->pindex); - m->object = NULL; - vm_cnt.v_cache_count--; -} - -/* - * Transfer all of the cached pages with offset greater than or - * equal to 'offidxstart' from the original object's cache to the - * new object's cache. However, any cached pages with offset - * greater than or equal to the new object's size are kept in the - * original object. Initially, the new object's cache must be - * empty. Offset 'offidxstart' in the original object must - * correspond to offset zero in the new object. - * - * The new object must be locked. - */ -void -vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, - vm_object_t new_object) -{ - vm_page_t m; - - /* - * Insertion into an object's collection of cached pages - * requires the object to be locked. In contrast, removal does - * not. - */ - VM_OBJECT_ASSERT_WLOCKED(new_object); - KASSERT(vm_radix_is_empty(&new_object->cache), - ("vm_page_cache_transfer: object %p has cached pages", - new_object)); - mtx_lock(&vm_page_queue_free_mtx); - while ((m = vm_radix_lookup_ge(&orig_object->cache, - offidxstart)) != NULL) { - /* - * Transfer all of the pages with offset greater than or - * equal to 'offidxstart' from the original object's - * cache to the new object's cache. - */ - if ((m->pindex - offidxstart) >= new_object->size) - break; - vm_radix_remove(&orig_object->cache, m->pindex); - /* Update the page's object and offset. */ - m->object = new_object; - m->pindex -= offidxstart; - if (vm_radix_insert(&new_object->cache, m)) - vm_page_cache_turn_free(m); - } - mtx_unlock(&vm_page_queue_free_mtx); -} - -/* - * Returns TRUE if a cached page is associated with the given object and - * offset, and FALSE otherwise. - * - * The object must be locked. - */ -boolean_t -vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) -{ - vm_page_t m; - - /* - * Insertion into an object's collection of cached pages requires the - * object to be locked. Therefore, if the object is locked and the - * object's collection is empty, there is no need to acquire the free - * page queues lock in order to prove that the specified page doesn't - * exist. - */ - VM_OBJECT_ASSERT_WLOCKED(object); - if (__predict_true(vm_object_cache_is_empty(object))) - return (FALSE); - mtx_lock(&vm_page_queue_free_mtx); - m = vm_page_cache_lookup(object, pindex); - mtx_unlock(&vm_page_queue_free_mtx); - return (m != NULL); -} - -/* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate - * VM_ALLOC_IFCACHED return page only if it is cached - * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page - * is cached * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { - struct vnode *vp = NULL; - vm_object_t m_object; vm_page_t m, mpred; int flags, req_class; - mpred = 0; /* XXX: pacify gcc */ + mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), - ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, - req)); + ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); + if (__predict_false((req & VM_ALLOC_IFCACHED) != 0)) + return (NULL); + req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc: pindex already allocated")); } /* - * The page allocation request can came from consumers which already - * hold the free page queue mutex, like vm_page_insert() in - * vm_page_cache(). + * Allocate a page if the number of free pages exceeds the minimum + * for the request class. */ - mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); + mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) { /* - * Allocate from the free queue if the number of free pages - * exceeds the minimum for the request class. + * Can we allocate the page from a reservation? */ - if (object != NULL && - (m = vm_page_cache_lookup(object, pindex)) != NULL) { - if ((req & VM_ALLOC_IFNOTCACHED) != 0) { - mtx_unlock(&vm_page_queue_free_mtx); - return (NULL); - } - if (vm_phys_unfree_page(m)) - vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); #if VM_NRESERVLEVEL > 0 - else if (!vm_reserv_reactivate_page(m)) -#else - else -#endif - panic("vm_page_alloc: cache page %p is missing" - " from the free queue", m); - } else if ((req & VM_ALLOC_IFCACHED) != 0) { - mtx_unlock(&vm_page_queue_free_mtx); - return (NULL); -#if VM_NRESERVLEVEL > 0 - } else if (object == NULL || (object->flags & (OBJ_COLORED | + if (object == NULL || (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) != OBJ_COLORED || (m = - vm_reserv_alloc_page(object, pindex, mpred)) == NULL) { -#else - } else { + vm_reserv_alloc_page(object, pindex, mpred)) == NULL) #endif + { + /* + * If not, allocate it from the free page queues. + */ m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 if (m == NULL && vm_reserv_reclaim_inactive()) { m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); } #endif } } else { /* * Not allocatable, give up. */ mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. */ KASSERT(m != NULL, ("vm_page_alloc: missing page")); - KASSERT(m->queue == PQ_NONE, - ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue)); - KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m)); - KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m)); - KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m)); - KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m)); - KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, - ("vm_page_alloc: page %p has unexpected memattr %d", m, - pmap_page_get_memattr(m))); - if ((m->flags & PG_CACHED) != 0) { - KASSERT((m->flags & PG_ZERO) == 0, - ("vm_page_alloc: cached page %p is PG_ZERO", m)); - KASSERT(m->valid != 0, - ("vm_page_alloc: cached page %p is invalid", m)); - if (m->object == object && m->pindex == pindex) - vm_cnt.v_reactivated++; - else - m->valid = 0; - m_object = m->object; - vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && - vm_object_cache_is_empty(m_object)) - vp = m_object->handle; - } else { - KASSERT(m->valid == 0, - ("vm_page_alloc: free page %p is valid", m)); - vm_phys_freecnt_adj(m, -1); - if ((m->flags & PG_ZERO) != 0) - vm_page_zero_count--; - } + vm_phys_freecnt_adj(m, -1); + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; mtx_unlock(&vm_page_queue_free_mtx); + vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; flags &= m->flags; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { - /* See the comment below about hold count. */ - if (vp != NULL) - vdrop(vp); pagedaemon_wakeup(); if (req & VM_ALLOC_WIRED) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } - m->object = NULL; + KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; - vm_page_free(m); + /* Don't change PG_ZERO. */ + vm_page_free_toq(m); return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; /* - * The following call to vdrop() must come after the above call - * to vm_page_insert() in case both affect the same object and - * vnode. Otherwise, the affected vnode's hold count could - * temporarily become zero. - */ - if (vp != NULL) - vdrop(vp); - - /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } -static void -vm_page_alloc_contig_vdrop(struct spglist *lst) -{ - - while (!SLIST_EMPTY(lst)) { - vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv); - SLIST_REMOVE_HEAD(lst, plinks.s.ss); - } -} - /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * + * The specified object may not contain fictitious pages. + * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { - struct vnode *drop; - struct spglist deferred_vdrop_list; - vm_page_t m, m_tmp, m_ret; - u_int flags; + vm_page_t m, m_ret, mpred; + u_int busy_lock, flags, oflags; int req_class; + mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), - ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, + ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); - KASSERT(object->type == OBJT_PHYS, - ("vm_page_alloc_contig: object %p isn't OBJT_PHYS", + KASSERT((object->flags & OBJ_FICTITIOUS) == 0, + ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; - SLIST_INIT(&deferred_vdrop_list); + if (object != NULL) { + mpred = vm_radix_lookup_le(&object->rtree, pindex); + KASSERT(mpred == NULL || mpred->pindex != pindex, + ("vm_page_alloc_contig: pindex already allocated")); + } + + /* + * Can we allocate the pages without the number of free pages falling + * below the lower bound for the allocation class? + */ mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages)) { + /* + * Can we allocate the pages from a reservation? + */ #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || (m_ret = vm_reserv_alloc_contig(object, pindex, npages, - low, high, alignment, boundary)) == NULL) + low, high, alignment, boundary, mpred)) == NULL) #endif + /* + * If not, allocate them from the free page queues. + */ m_ret = vm_phys_alloc_contig(npages, low, high, alignment, boundary); } else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, npages); pagedaemon_wakeup(); return (NULL); } if (m_ret != NULL) - for (m = m_ret; m < &m_ret[npages]; m++) { - drop = vm_page_alloc_init(m); - if (drop != NULL) { - /* - * Enqueue the vnode for deferred vdrop(). - */ - m->plinks.s.pv = drop; - SLIST_INSERT_HEAD(&deferred_vdrop_list, m, - plinks.s.ss); - } - } + vm_phys_freecnt_adj(m_ret, -npages); else { #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(npages, low, high, alignment, boundary)) goto retry; #endif } + for (m = m_ret; m < &m_ret[npages]; m++) + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); + for (m = m_ret; m < &m_ret[npages]; m++) + vm_page_alloc_check(m); /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; + oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? + VPO_UNMANAGED : 0; + busy_lock = VPB_UNBUSIED; + if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) + busy_lock = VPB_SINGLE_EXCLUSIVER; + if ((req & VM_ALLOC_SBUSY) != 0) + busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; - m->busy_lock = VPB_UNBUSIED; - if (object != NULL) { - if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) - m->busy_lock = VPB_SINGLE_EXCLUSIVER; - if ((req & VM_ALLOC_SBUSY) != 0) - m->busy_lock = VPB_SHARERS_WORD(1); - } + m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; - /* Unmanaged pages don't use "act_count". */ - m->oflags = VPO_UNMANAGED; + m->act_count = 0; + m->oflags = oflags; if (object != NULL) { - if (vm_page_insert(m, object, pindex)) { - vm_page_alloc_contig_vdrop( - &deferred_vdrop_list); - if (vm_paging_needed()) - pagedaemon_wakeup(); + if (vm_page_insert_after(m, object, pindex, mpred)) { + pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0) - atomic_subtract_int(&vm_cnt.v_wire_count, - npages); - for (m_tmp = m, m = m_ret; - m < &m_ret[npages]; m++) { - if ((req & VM_ALLOC_WIRED) != 0) + atomic_subtract_int( + &vm_cnt.v_wire_count, npages); + KASSERT(m->object == NULL, + ("page %p has object", m)); + mpred = m; + for (m = m_ret; m < &m_ret[npages]; m++) { + if (m <= mpred && + (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; - if (m >= m_tmp) { - m->object = NULL; - m->oflags |= VPO_UNMANAGED; - } + m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; - vm_page_free(m); + /* Don't change PG_ZERO. */ + vm_page_free_toq(m); } return (NULL); } + mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } - vm_page_alloc_contig_vdrop(&deferred_vdrop_list); if (vm_paging_needed()) pagedaemon_wakeup(); return (m_ret); } /* - * Initialize a page that has been freshly dequeued from a freelist. - * The caller has to drop the vnode returned, if it is not NULL. - * - * This function may only be used to initialize unmanaged pages. - * - * To be called with vm_page_queue_free_mtx held. + * Check a page that has been freshly dequeued from a freelist. */ -static struct vnode * -vm_page_alloc_init(vm_page_t m) +static void +vm_page_alloc_check(vm_page_t m) { - struct vnode *drop; - vm_object_t m_object; + KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE, - ("vm_page_alloc_init: page %p has unexpected queue %d", - m, m->queue)); - KASSERT(m->wire_count == 0, - ("vm_page_alloc_init: page %p is wired", m)); - KASSERT(m->hold_count == 0, - ("vm_page_alloc_init: page %p is held", m)); - KASSERT(!vm_page_busied(m), - ("vm_page_alloc_init: page %p is busy", m)); - KASSERT(m->dirty == 0, - ("vm_page_alloc_init: page %p is dirty", m)); + ("page %p has unexpected queue %d", m, m->queue)); + KASSERT(m->wire_count == 0, ("page %p is wired", m)); + KASSERT(m->hold_count == 0, ("page %p is held", m)); + KASSERT(!vm_page_busied(m), ("page %p is busy", m)); + KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, - ("vm_page_alloc_init: page %p has unexpected memattr %d", + ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - drop = NULL; - if ((m->flags & PG_CACHED) != 0) { - KASSERT((m->flags & PG_ZERO) == 0, - ("vm_page_alloc_init: cached page %p is PG_ZERO", m)); - m->valid = 0; - m_object = m->object; - vm_page_cache_remove(m); - if (m_object->type == OBJT_VNODE && - vm_object_cache_is_empty(m_object)) - drop = m_object->handle; - } else { - KASSERT(m->valid == 0, - ("vm_page_alloc_init: free page %p is valid", m)); - vm_phys_freecnt_adj(m, -1); - if ((m->flags & PG_ZERO) != 0) - vm_page_zero_count--; - } - return (drop); + KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_freelist(int flind, int req) { - struct vnode *drop; vm_page_t m; u_int flags; int req_class; req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Do not allocate reserved pages unless the req has asked for it. */ - mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); + mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } if (m == NULL) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } - drop = vm_page_alloc_init(m); + vm_phys_freecnt_adj(m, -1); + if ((m->flags & PG_ZERO) != 0) + vm_page_zero_count--; mtx_unlock(&vm_page_queue_free_mtx); + vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; - if (drop != NULL) - vdrop(drop); if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { struct mtx *m_mtx, *new_mtx; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; m_mtx = NULL; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } m_inc = 1; retry: if (m->wire_count != 0 || m->hold_count != 0) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = m->object) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ if (!VM_OBJECT_TRYRLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_RLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_RUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { run_ext = 0; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); - /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && - object->type != OBJT_VNODE) + object->type != OBJT_VNODE) { run_ext = 0; - else if ((m->flags & PG_CACHED) != 0 || - m != vm_page_lookup(object, m->pindex)) { - /* - * The page is cached or recently converted - * from cached to free. - */ #if VM_NRESERVLEVEL > 0 - if (level >= 0) { - /* - * The page is reserved. Extend the - * current run by one page. - */ - run_ext = 1; - } else -#endif - if ((order = m->order) < VM_NFREEORDER) { - /* - * The page is enqueued in the - * physical memory allocator's cache/ - * free page queues. Moreover, it is - * the first page in a power-of-two- - * sized run of contiguous cache/free - * pages. Add these pages to the end - * of the current run, and jump - * ahead. - */ - run_ext = 1 << order; - m_inc = 1 << order; - } else - run_ext = 0; -#if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && m->queue != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ run_ext = 1; } else run_ext = 0; unlock: VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In - * other words, it is still cached or free. Extend - * the current run by one page. + * other words, it is still free. Extend the current + * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory - * allocator's cache/free page queues. Moreover, it - * is the first page in a power-of-two-sized run of - * contiguous cache/free pages. Add these pages to - * the end of the current run, and jump ahead. + * allocator's free page queues. Moreover, it is the + * first page in a power-of-two-sized run of + * contiguous free pages. Add these pages to the end + * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's - * cache/free page queues. However, it is not the - * first page in a run of contiguous cache/free pages. - * (This case rarely occurs because the scan is - * performed in ascending order.) (2) It is not - * reserved, and it is transitioning from free to - * allocated. (Conversely, the transition from - * allocated to free for managed pages is blocked by - * the page lock.) (3) It is allocated but not - * contained by an object and not wired, e.g., - * allocated by Xen's balloon driver. + * free page queues. However, it is not the first + * page in a run of contiguous free pages. (This case + * rarely occurs because the scan is performed in + * ascending order.) (2) It is not reserved, and it is + * transitioning from free to allocated. (Conversely, + * the transition from allocated to free for managed + * pages is blocked by the page lock.) (3) It is + * allocated but not contained by an object and not + * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (m_mtx != NULL) mtx_unlock(m_mtx); if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct mtx *m_mtx, *new_mtx; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; m_mtx = NULL; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(m); if (m_mtx != new_mtx) { if (m_mtx != NULL) mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } retry: if (m->wire_count != 0 || m->hold_count != 0) error = EBUSY; else if ((object = m->object) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_WLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_WUNLOCK(object); goto retry; } else if (m->wire_count != 0 || m->hold_count != 0) { error = EBUSY; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); - /* Don't care: PG_NODUMP, PG_WINATCFLS, PG_ZERO. */ + /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; - else if ((m->flags & PG_CACHED) != 0 || - m != vm_page_lookup(object, m->pindex)) { - /* - * The page is cached or recently converted - * from cached to free. - */ - VM_OBJECT_WUNLOCK(object); - goto cached; - } else if (object->memattr != VM_MEMATTR_DEFAULT) + else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (m->queue != PQ_NONE && !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ if (m->valid != 0) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { error = ENOMEM; goto unlock; } KASSERT(m_new->wire_count == 0, ("page %p is wired", m)); /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ if (object->ref_count != 0) pmap_remove_all(m); m_new->aflags = m->aflags; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m)); m_new->oflags = m->oflags & VPO_NOSYNC; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_xbusy(m); vm_page_remque(m); vm_page_replace_checked(m_new, object, m->pindex, m); m->valid = 0; vm_page_undirty(m); /* * The new page must be deactivated * before the object is unlocked. */ new_mtx = vm_page_lockptr(m_new); if (m_mtx != new_mtx) { mtx_unlock(m_mtx); m_mtx = new_mtx; mtx_lock(m_mtx); } vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_remque(m); vm_page_remove(m); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } SLIST_INSERT_HEAD(&free, m, plinks.s.ss); } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { -cached: mtx_lock(&vm_page_queue_free_mtx); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory - * allocator's cache/free page queues. - * Moreover, it is the first page in a power- - * of-two-sized run of contiguous cache/free - * pages. Jump ahead to the last page within - * that run, and continue from there. + * allocator's free page queues. Moreover, it + * is the first page in a power-of-two-sized + * run of contiguous free pages. Jump ahead + * to the last page within that run, and + * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif mtx_unlock(&vm_page_queue_free_mtx); if (order == VM_NFREEORDER) error = EINVAL; } } if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { mtx_lock(&vm_page_queue_free_mtx); do { SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (true) #endif vm_phys_free_pages(m, 0); } while ((m = SLIST_FIRST(&free)) != NULL); vm_page_zero_idle_wakeup(); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may - * fail due to a shortage of cache/free pages. When reclamation fails, - * callers are expected to perform VM_WAIT before retrying a failed - * allocation operation, e.g., vm_page_alloc_contig(). + * fail due to a shortage of free pages. When reclamation fails, callers + * are expected to perform VM_WAIT before retrying a failed allocation + * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* - * Return if the number of cached and free pages cannot satisfy the - * requested allocation. + * Return if the number of free pages cannot satisfy the requested + * allocation. */ count = vm_cnt.v_free_count + vm_cnt.v_cache_count; if (count < npages + vm_cnt.v_free_reserved || (count < npages + vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } /* * vm_wait: (also see VM_WAIT macro) * * Sleep until free pages are available for allocation. * - Called in various places before memory allocations. */ void vm_wait(void) { mtx_lock(&vm_page_queue_free_mtx); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (__predict_false(pageproc == NULL)) panic("vm_wait in early boot"); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { mtx_lock(&vm_page_queue_free_mtx); if (!vm_pageout_wanted) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } vm_pages_needed = true; msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, "pfault", 0); } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { - return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); + if (vm_page_in_laundry(m)) + return (&vm_dom[0].vmd_pagequeues[m->queue]); + else + return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* * vm_page_dequeue: * * Remove the given page from its current page queue. * * The page must be locked. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_assert_locked(m); KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } /* * vm_page_dequeue_locked: * * Remove the given page from its current page queue. * * The page and page queue must be locked. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } /* * vm_page_enqueue: * * Add the given page to the specified page queue. * * The page must be locked. */ static void vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(queue < PQ_COUNT, ("vm_page_enqueue: invalid queue %u request for page %p", queue, m)); - pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; + if (queue == PQ_LAUNDRY) + pq = &vm_dom[0].vmd_pagequeues[queue]; + else + pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } /* * vm_page_requeue: * * Move the given page to the tail of its current page queue. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_unlock(pq); } /* * vm_page_requeue_locked: * * Move the given page to the tail of its current page queue. * * The page queue must be locked. */ void vm_page_requeue_locked(vm_page_t m) { struct vm_pagequeue *pq; KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { int queue; vm_page_lock_assert(m, MA_OWNED); if ((queue = m->queue) != PQ_ACTIVE) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_ACTIVE, m); } else KASSERT(queue == PQ_NONE, ("vm_page_activate: wired page %p is queued", m)); } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * - * Helper routine for vm_page_free_toq() and vm_page_cache(). This - * routine is called when a page has been added to the cache or free - * queues. + * Helper routine for vm_page_free_toq(). This routine is called + * when a page is added to the free queues. * * The page queues must be locked. */ static inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && vm_cnt.v_cache_count + vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = false; wakeup(&vm_cnt.v_free_count); } } /* - * Turn a cached page into a free page, by changing its attributes. - * Keep the statistics up-to-date. - * - * The free page queue must be locked. - */ -static void -vm_page_cache_turn_free(vm_page_t m) -{ - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - - m->object = NULL; - m->valid = 0; - KASSERT((m->flags & PG_CACHED) != 0, - ("vm_page_cache_turn_free: page %p is not cached", m)); - m->flags &= ~PG_CACHED; - vm_cnt.v_cache_count--; - vm_phys_freecnt_adj(m, 1); -} - -/* * vm_page_free_toq: * * Returns the given page to the free list, * disassociating it with any VM object. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_free_toq(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); } else KASSERT(m->queue == PQ_NONE, ("vm_page_free_toq: unmanaged page %p is queued", m)); PCPU_INC(cnt.v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); /* * Unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_page_remque(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) panic("vm_page_free: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; } else { /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* - * Insert the page into the physical memory allocator's - * cache/free page queues. + * Insert the page into the physical memory allocator's free + * page queues. */ mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (TRUE) #endif vm_phys_free_pages(m, 0); if ((m->flags & PG_ZERO) != 0) ++vm_page_zero_count; else vm_page_zero_idle_wakeup(); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * If the page is fictitious, then its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count == 0) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially allowing it to be * paged out. Returns TRUE if the number of wirings transitions to zero and * FALSE otherwise. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue (unless PQ_NONE is * specified). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ boolean_t vm_page_unwire(vm_page_t m, uint8_t queue) { KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return (FALSE); } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); if ((m->oflags & VPO_UNMANAGED) == 0 && - m->object != NULL && queue != PQ_NONE) { - if (queue == PQ_INACTIVE) - m->flags &= ~PG_WINATCFLS; + m->object != NULL && queue != PQ_NONE) vm_page_enqueue(queue, m); - } return (TRUE); } else return (FALSE); } else panic("vm_page_unwire: page %p's wire count is zero", m); } /* * Move the specified page to the inactive queue. * - * Many pages placed on the inactive queue should actually go - * into the cache, but it is difficult to figure out which. What - * we do instead, if the inactive target is well met, is to put - * clean pages at the head of the inactive queue instead of the tail. - * This will cause them to be moved to the cache more quickly and - * if not actively re-referenced, reclaimed more quickly. If we just - * stick these pages at the end of the inactive queue, heavy filesystem - * meta-data accesses can cause an unnecessary paging load on memory bound - * processes. This optimization causes one-time-use metadata to be - * reused more quickly. - * - * Normally noreuse is FALSE, resulting in LRU operation. noreuse is set - * to TRUE if we want this page to be 'as if it were placed in the cache', - * except without unmapping it from the process address space. In - * practice this is implemented by inserting the page at the head of the + * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive + * queue. However, setting "noreuse" to TRUE will accelerate the specified + * page's reclamation, but it will not unmap the page from any address space. + * This is implemented by inserting the page near the head of the inactive * queue, using a marker page to guide FIFO insertion ordering. * * The page must be locked. */ static inline void _vm_page_deactivate(vm_page_t m, boolean_t noreuse) { struct vm_pagequeue *pq; int queue; vm_page_assert_locked(m); /* * Ignore if the page is already inactive, unless it is unlikely to be * reactivated. */ if ((queue = m->queue) == PQ_INACTIVE && !noreuse) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; /* Avoid multiple acquisitions of the inactive queue lock. */ if (queue == PQ_INACTIVE) { vm_pagequeue_lock(pq); vm_page_dequeue_locked(m); } else { if (queue != PQ_NONE) vm_page_dequeue(m); - m->flags &= ~PG_WINATCFLS; vm_pagequeue_lock(pq); } m->queue = PQ_INACTIVE; if (noreuse) TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead, m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } /* * Move the specified page to the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, FALSE); } /* * Move the specified page to the inactive queue with the expectation * that it is unlikely to be reused. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { _vm_page_deactivate(m, TRUE); } /* - * vm_page_try_to_cache: + * vm_page_launder * - * Returns 0 on failure, 1 on success + * Put a page in the laundry. */ -int -vm_page_try_to_cache(vm_page_t m) +void +vm_page_launder(vm_page_t m) { + int queue; - vm_page_lock_assert(m, MA_OWNED); - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (m->dirty || m->hold_count || m->wire_count || - (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) - return (0); - pmap_remove_all(m); - if (m->dirty) - return (0); - vm_page_cache(m); - return (1); + vm_page_assert_locked(m); + if ((queue = m->queue) != PQ_LAUNDRY) { + if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { + if (queue != PQ_NONE) + vm_page_dequeue(m); + vm_page_enqueue(PQ_LAUNDRY, m); + } else + KASSERT(queue == PQ_NONE, + ("wired page %p is queued", m)); + } } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); if (m->object != NULL) VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_free(m); return (1); } /* - * vm_page_cache - * - * Put the specified page onto the page cache queue (if appropriate). - * - * The object and page must be locked. - */ -void -vm_page_cache(vm_page_t m) -{ - vm_object_t object; - boolean_t cache_was_empty; - - vm_page_lock_assert(m, MA_OWNED); - object = m->object; - VM_OBJECT_ASSERT_WLOCKED(object); - if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) || - m->hold_count || m->wire_count) - panic("vm_page_cache: attempting to cache busy page"); - KASSERT(!pmap_page_is_mapped(m), - ("vm_page_cache: page %p is mapped", m)); - KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m)); - if (m->valid == 0 || object->type == OBJT_DEFAULT || - (object->type == OBJT_SWAP && - !vm_pager_has_page(object, m->pindex, NULL, NULL))) { - /* - * Hypothesis: A cache-eligible page belonging to a - * default object or swap object but without a backing - * store must be zero filled. - */ - vm_page_free(m); - return; - } - KASSERT((m->flags & PG_CACHED) == 0, - ("vm_page_cache: page %p is already cached", m)); - - /* - * Remove the page from the paging queues. - */ - vm_page_remque(m); - - /* - * Remove the page from the object's collection of resident - * pages. - */ - vm_radix_remove(&object->rtree, m->pindex); - TAILQ_REMOVE(&object->memq, m, listq); - object->resident_page_count--; - - /* - * Restore the default memory attribute to the page. - */ - if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) - pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); - - /* - * Insert the page into the object's collection of cached pages - * and the physical memory allocator's cache/free page queues. - */ - m->flags &= ~PG_ZERO; - mtx_lock(&vm_page_queue_free_mtx); - cache_was_empty = vm_radix_is_empty(&object->cache); - if (vm_radix_insert(&object->cache, m)) { - mtx_unlock(&vm_page_queue_free_mtx); - if (object->type == OBJT_VNODE && - object->resident_page_count == 0) - vdrop(object->handle); - m->object = NULL; - vm_page_free(m); - return; - } - - /* - * The above call to vm_radix_insert() could reclaim the one pre- - * existing cached page from this object, resulting in a call to - * vdrop(). - */ - if (!cache_was_empty) - cache_was_empty = vm_radix_is_singleton(&object->cache); - - m->flags |= PG_CACHED; - vm_cnt.v_cache_count++; - PCPU_INC(cnt.v_tcached); -#if VM_NRESERVLEVEL > 0 - if (!vm_reserv_free_page(m)) { -#else - if (TRUE) { -#endif - vm_phys_free_pages(m, 0); - } - vm_page_free_wakeup(); - mtx_unlock(&vm_page_queue_free_mtx); - - /* - * Increment the vnode's hold count if this is the object's only - * cached page. Decrement the vnode's hold count if this was - * the object's only resident page. - */ - if (object->type == OBJT_VNODE) { - if (cache_was_empty && object->resident_page_count != 0) - vhold(object->handle); - else if (!cache_was_empty && object->resident_page_count == 0) - vdrop(object->handle); - } -} - -/* * vm_page_advise * * Deactivate or do nothing, as appropriate. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed - * up by the system. However, such pages are often reused - * quickly by malloc() so we do not do anything that would - * cause a page fault if we can help it. - * - * Specifically, we do not try to actually free the page now - * nor do we try to put it in the cache (which would cause a - * page fault on reuse). - * - * But we do make the page as freeable as we can without - * actually taking the step of unmapping it. + * without first paging it out. MADV_FREE pages are often + * quickly reused by malloc(3), so we do not do anything that + * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) return; /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that - * the page will be reused quickly. Dirty pages are given a chance to - * cycle once through the inactive queue before becoming eligible for - * laundering. + * the page will be reused quickly. Dirty pages not already in the + * laundry are moved there. */ - _vm_page_deactivate(m, m->dirty == 0); + if (m->dirty == 0) + vm_page_deactivate_noreuse(m); + else + vm_page_launder(m); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, allocflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retrylookup; - } else if (m->valid != 0) - return (m); + } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * vm_page_ps_is_valid: * * Returns TRUE if the entire (super)page is valid and FALSE otherwise. */ boolean_t vm_page_ps_is_valid(vm_page_t m) { int i, npages; VM_OBJECT_ASSERT_LOCKED(m->object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { if (m[i].valid != VM_PAGE_BITS_ALL) return (FALSE); } return (TRUE); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count); db_printf("vm_cnt.v_cache_count: %d\n", vm_cnt.v_cache_count); db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); + db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d pq_cache %d\n", vm_cnt.v_free_count, vm_cnt.v_cache_count); for (dom = 0; dom < vm_ndomains; dom++) { - db_printf("dom %d page_cnt %d free %d pq_act %d pq_inact %d\n", + db_printf( + "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, - vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt); + vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, + vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags, m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: stable/11/sys/vm/vm_page.h =================================================================== --- stable/11/sys/vm/vm_page.h (revision 318715) +++ stable/11/sys/vm/vm_page.h (revision 318716) @@ -1,702 +1,717 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several collections: * * A radix tree used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * In general, operations on this structure's mutable fields are * synchronized using either one of or a combination of the lock on the * object that the page belongs to (O), the pool lock for the page (P), * or the lock for either the free or paging queue (Q). If a field is * annotated below with two of these locks, then holding either lock is * sufficient for read access, but both locks are required for write * access. * * In contrast, the synchronization of accesses to the page's * dirty field is machine dependent (M). In the * machine-independent layer, the lock on the object that the * page belongs to must be held in order to operate on the field. * However, the pmap layer is permitted to set all bits within * the field without holding that lock. If the underlying * architecture does not support atomic read-modify-write * operations on the field's type, then the machine-independent * layer uses a 32-bit atomic on the aligned 32-bit word that * contains the dirty field. In the machine-independent layer, * the implementation of read-modify-write operations on the * field is encapsulated in vm_page_clear_dirty_mask(). */ #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xffu typedef uint8_t vm_page_bits_t; #elif PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffffu typedef uint16_t vm_page_bits_t; #elif PAGE_SIZE == 16384 #define VM_PAGE_BITS_ALL 0xffffffffu typedef uint32_t vm_page_bits_t; #elif PAGE_SIZE == 32768 #define VM_PAGE_BITS_ALL 0xfffffffffffffffflu typedef uint64_t vm_page_bits_t; #endif struct vm_page { union { TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */ struct { SLIST_ENTRY(vm_page) ss; /* private slists */ void *pv; } s; struct { u_long p; u_long v; } memguard; } plinks; TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_paddr_t phys_addr; /* physical address of page */ struct md_page md; /* machine dependent stuff */ u_int wire_count; /* wired down maps refs (P) */ volatile u_int busy_lock; /* busy owners lock */ uint16_t hold_count; /* page hold count (P) */ uint16_t flags; /* page PG_* flags (P) */ uint8_t aflags; /* access is atomic */ uint8_t oflags; /* page VPO_* flags (O) */ uint8_t queue; /* page queue index (P,Q) */ int8_t psind; /* pagesizes[] index (O) */ int8_t segind; uint8_t order; /* index of the buddy queue */ uint8_t pool; u_char act_count; /* page usage count (P) */ /* NOTE that these must support one bit per DEV_BSIZE in a page */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ vm_page_bits_t valid; /* map of valid DEV_BSIZE chunks (O) */ vm_page_bits_t dirty; /* map of dirty DEV_BSIZE chunks (M) */ }; /* * Page flags stored in oflags: * * Access to these page flags is synchronized by the lock on the object * containing the page (O). * * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG) * indicates that the page is not under PV management but * otherwise should be treated as a normal page. Pages not * under PV management cannot be paged out via the * object/vm_page_t because there is no knowledge of their pte * mappings, and such pages are also not on any PQ queue. * */ #define VPO_UNUSED01 0x01 /* --available-- */ #define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */ #define VPO_UNMANAGED 0x04 /* no PV management for page */ #define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */ #define VPO_NOSYNC 0x10 /* do not collect for syncer */ /* * Busy page implementation details. * The algorithm is taken mostly by rwlock(9) and sx(9) locks implementation, * even if the support for owner identity is removed because of size * constraints. Checks on lock recursion are then not possible, while the * lock assertions effectiveness is someway reduced. */ #define VPB_BIT_SHARED 0x01 #define VPB_BIT_EXCLUSIVE 0x02 #define VPB_BIT_WAITERS 0x04 #define VPB_BIT_FLAGMASK \ (VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS) #define VPB_SHARERS_SHIFT 3 #define VPB_SHARERS(x) \ (((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT) #define VPB_SHARERS_WORD(x) ((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED) #define VPB_ONE_SHARER (1 << VPB_SHARERS_SHIFT) #define VPB_SINGLE_EXCLUSIVER VPB_BIT_EXCLUSIVE #define VPB_UNBUSIED VPB_SHARERS_WORD(0) #define PQ_NONE 255 #define PQ_INACTIVE 0 #define PQ_ACTIVE 1 -#define PQ_COUNT 2 +#define PQ_LAUNDRY 2 +#define PQ_COUNT 3 TAILQ_HEAD(pglist, vm_page); SLIST_HEAD(spglist, vm_page); struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; int pq_cnt; u_int * const pq_vcnt; const char * const pq_name; } __aligned(CACHE_LINE_SIZE); struct vm_domain { struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; u_int vmd_page_count; u_int vmd_free_count; long vmd_segs; /* bitmask of the segments */ boolean_t vmd_oom; int vmd_oom_seq; int vmd_last_active_scan; + struct vm_page vmd_laundry_marker; struct vm_page vmd_marker; /* marker for pagedaemon private use */ struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */ }; extern struct vm_domain vm_dom[MAXMEMDOM]; #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) +#define vm_pagequeue_lockptr(pq) (&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) #ifdef _KERNEL static __inline void vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) { #ifdef notyet vm_pagequeue_assert_locked(pq); #endif pq->pq_cnt += addend; atomic_add_int(pq->pq_vcnt, addend); } #define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) #define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) #endif /* _KERNEL */ extern struct mtx_padalign vm_page_queue_free_mtx; extern struct mtx_padalign pa_lock[]; #if defined(__arm__) #define PDRSHIFT PDR_SHIFT #elif !defined(PDRSHIFT) #define PDRSHIFT 21 #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define PA_LOCKPTR(pa) ((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT])) #define PA_LOCKOBJPTR(pa) ((struct lock_object *)PA_LOCKPTR((pa))) #define PA_LOCK(pa) mtx_lock(PA_LOCKPTR(pa)) #define PA_TRYLOCK(pa) mtx_trylock(PA_LOCKPTR(pa)) #define PA_UNLOCK(pa) mtx_unlock(PA_LOCKPTR(pa)) #define PA_UNLOCK_COND(pa) \ do { \ if ((pa) != 0) { \ PA_UNLOCK((pa)); \ (pa) = 0; \ } \ } while (0) #define PA_LOCK_ASSERT(pa, a) mtx_assert(PA_LOCKPTR(pa), (a)) #ifdef KLD_MODULE #define vm_page_lock(m) vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_unlock(m) vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE) #define vm_page_trylock(m) vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE) #else /* !KLD_MODULE */ #define vm_page_lockptr(m) (PA_LOCKPTR(VM_PAGE_TO_PHYS((m)))) #define vm_page_lock(m) mtx_lock(vm_page_lockptr((m))) #define vm_page_unlock(m) mtx_unlock(vm_page_lockptr((m))) #define vm_page_trylock(m) mtx_trylock(vm_page_lockptr((m))) #endif #if defined(INVARIANTS) #define vm_page_assert_locked(m) \ vm_page_assert_locked_KBI((m), __FILE__, __LINE__) #define vm_page_lock_assert(m, a) \ vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__) #else #define vm_page_assert_locked(m) #define vm_page_lock_assert(m, a) #endif /* * The vm_page's aflags are updated using atomic operations. To set or clear * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear() * must be used. Neither these flags nor these functions are part of the KBI. * * PGA_REFERENCED may be cleared only if the page is locked. It is set by * both the MI and MD VM layers. However, kernel loadable modules should not * directly set this flag. They should call vm_page_reference() instead. * * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter(). * When it does so, the object must be locked, or the page must be * exclusive busied. The MI VM layer must never access this flag * directly. Instead, it should call pmap_page_is_write_mapped(). * * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has * at least one executable mapping. It is not consumed by the MI VM layer. */ #define PGA_WRITEABLE 0x01 /* page may be mapped writeable */ #define PGA_REFERENCED 0x02 /* page has been referenced */ #define PGA_EXECUTABLE 0x04 /* page may be mapped executable */ /* * Page flags. If changed at any other time than page allocation or * freeing, the modification must be protected by the vm_page lock. */ -#define PG_CACHED 0x0001 /* page is cached */ #define PG_FICTITIOUS 0x0004 /* physical page doesn't exist */ #define PG_ZERO 0x0008 /* page is zeroed */ #define PG_MARKER 0x0010 /* special queue marker page */ -#define PG_WINATCFLS 0x0040 /* flush dirty page on inactive q */ #define PG_NODUMP 0x0080 /* don't include this page in a dump */ #define PG_UNHOLDFREE 0x0100 /* delayed free of a held page */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 64 #ifdef _KERNEL #include #include /* * Each pageable resident page falls into one of four lists: * * free * Available for allocation now. * - * cache - * Almost available for allocation. Still associated with - * an object, but clean and immediately freeable. - * - * The following lists are LRU sorted: - * * inactive * Low activity, candidates for reclamation. + * This list is approximately LRU ordered. + * + * laundry * This is the list of pages that should be * paged out next. * * active - * Pages that are "active" i.e. they have been + * Pages that are "active", i.e., they have been * recently referenced. * */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern long vm_page_array_size; /* number of vm_page_t's */ extern long first_page; /* first physical page number */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) /* * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory * page to which the given physical address belongs. The correct vm_page_t * object is returned for addresses that are not page-aligned. */ vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa); /* * Page allocation parameters for vm_page for the functions * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and * vm_page_alloc_freelist(). Some functions support only a subset * of the flags, and ignore others, see the flags legend. * * Bits 0 - 1 define class. * Bits 2 - 15 dedicated for flags. * Legend: * (a) - vm_page_alloc() supports the flag. * (c) - vm_page_alloc_contig() supports the flag. * (f) - vm_page_alloc_freelist() supports the flag. * (g) - vm_page_grab() supports the flag. * Bits above 15 define the count of additional pages that the caller * intends to allocate. */ #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_CLASS_MASK 3 #define VM_ALLOC_WIRED 0x0020 /* (acfg) Allocate non pageable page */ #define VM_ALLOC_ZERO 0x0040 /* (acfg) Try to obtain a zeroed page */ #define VM_ALLOC_NOOBJ 0x0100 /* (acg) No associated object */ #define VM_ALLOC_NOBUSY 0x0200 /* (acg) Do not busy the page */ -#define VM_ALLOC_IFCACHED 0x0400 /* (ag) Fail if page is not cached */ -#define VM_ALLOC_IFNOTCACHED 0x0800 /* (ag) Fail if page is cached */ +#define VM_ALLOC_IFCACHED 0x0400 +#define VM_ALLOC_IFNOTCACHED 0x0800 #define VM_ALLOC_IGN_SBUSY 0x1000 /* (g) Ignore shared busy flag */ #define VM_ALLOC_NODUMP 0x2000 /* (ag) don't include in dump */ #define VM_ALLOC_SBUSY 0x4000 /* (acg) Shared busy the page */ #define VM_ALLOC_NOWAIT 0x8000 /* (g) Do not sleep, return NULL */ #define VM_ALLOC_COUNT_SHIFT 16 #define VM_ALLOC_COUNT(count) ((count) << VM_ALLOC_COUNT_SHIFT) #ifdef M_NOWAIT static inline int malloc2vm_flags(int malloc_flags) { int pflags; KASSERT((malloc_flags & M_USE_RESERVE) == 0 || (malloc_flags & M_NOWAIT) != 0, ("M_USE_RESERVE requires M_NOWAIT")); pflags = (malloc_flags & M_USE_RESERVE) != 0 ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM; if ((malloc_flags & M_ZERO) != 0) pflags |= VM_ALLOC_ZERO; if ((malloc_flags & M_NODUMP) != 0) pflags |= VM_ALLOC_NODUMP; return (pflags); } #endif void vm_page_busy_downgrade(vm_page_t m); void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared); void vm_page_flash(vm_page_t m); void vm_page_hold(vm_page_t mem); void vm_page_unhold(vm_page_t mem); void vm_page_free(vm_page_t m); void vm_page_free_zero(vm_page_t m); void vm_page_activate (vm_page_t); void vm_page_advise(vm_page_t m, int advice); vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int); vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_page_t vm_page_alloc_freelist(int, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); -void vm_page_cache(vm_page_t); -void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t); -void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t); -int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_deactivate (vm_page_t); void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); void vm_page_dequeue_locked(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t); vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); -boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex); +void vm_page_launder(vm_page_t m); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); struct vm_pagequeue *vm_page_pagequeue(vm_page_t m); vm_page_t vm_page_prev(vm_page_t m); boolean_t vm_page_ps_is_valid(vm_page_t m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); void vm_page_reference(vm_page_t m); void vm_page_remove (vm_page_t); int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex); void vm_page_requeue(vm_page_t m); void vm_page_requeue_locked(vm_page_t m); int vm_page_sbusied(vm_page_t m); vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options); void vm_page_set_valid_range(vm_page_t m, int base, int size); int vm_page_sleep_if_busy(vm_page_t m, const char *msg); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_sunbusy(vm_page_t m); int vm_page_trysbusy(vm_page_t m); void vm_page_unhold_pages(vm_page_t *ma, int count); boolean_t vm_page_unwire(vm_page_t m, uint8_t queue); void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_wire (vm_page_t); void vm_page_xunbusy_hard(vm_page_t m); void vm_page_xunbusy_maybelocked(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); void vm_page_clear_dirty (vm_page_t, int, int); void vm_page_set_invalid (vm_page_t, int, int); int vm_page_is_valid (vm_page_t, int, int); void vm_page_test_dirty (vm_page_t); vm_page_bits_t vm_page_bits(int base, int size); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); void vm_page_zero_idle_wakeup(void); void vm_page_dirty_KBI(vm_page_t m); void vm_page_lock_KBI(vm_page_t m, const char *file, int line); void vm_page_unlock_KBI(vm_page_t m, const char *file, int line); int vm_page_trylock_KBI(vm_page_t m, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line); void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line); #endif #define vm_page_assert_sbusied(m) \ KASSERT(vm_page_sbusied(m), \ ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_unbusied(m) \ KASSERT(!vm_page_busied(m), \ ("vm_page_assert_unbusied: page %p busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_assert_xbusied(m) \ KASSERT(vm_page_xbusied(m), \ ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \ (m), __FILE__, __LINE__)) #define vm_page_busied(m) \ ((m)->busy_lock != VPB_UNBUSIED) #define vm_page_sbusy(m) do { \ if (!vm_page_trysbusy(m)) \ panic("%s: page %p failed shared busying", __func__, \ (m)); \ } while (0) #define vm_page_tryxbusy(m) \ (atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED, \ VPB_SINGLE_EXCLUSIVER)) #define vm_page_xbusied(m) \ (((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0) #define vm_page_xbusy(m) do { \ if (!vm_page_tryxbusy(m)) \ panic("%s: page %p failed exclusive busying", __func__, \ (m)); \ } while (0) /* Note: page m's lock must not be owned by the caller. */ #define vm_page_xunbusy(m) do { \ if (!atomic_cmpset_rel_int(&(m)->busy_lock, \ VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) \ vm_page_xunbusy_hard(m); \ } while (0) #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m); #define VM_PAGE_OBJECT_LOCK_ASSERT(m) vm_page_object_lock_assert(m) void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits); #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) \ vm_page_assert_pga_writeable(m, bits) #else #define VM_PAGE_OBJECT_LOCK_ASSERT(m) (void)0 #define VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits) (void)0 #endif /* * We want to use atomic updates for the aflags field, which is 8 bits wide. * However, not all architectures support atomic operations on 8-bit * destinations. In order that we can easily use a 32-bit operation, we * require that the aflags field be 32-bit aligned. */ CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0); /* * Clear the given bits in the specified page. */ static inline void vm_page_aflag_clear(vm_page_t m, uint8_t bits) { uint32_t *addr, val; /* * The PGA_REFERENCED flag can only be cleared if the page is locked. */ if ((bits & PGA_REFERENCED) != 0) vm_page_assert_locked(m); /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ addr = (void *)&m->aflags; KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0, ("vm_page_aflag_clear: aflags is misaligned")); val = bits; #if BYTE_ORDER == BIG_ENDIAN val <<= 24; #endif atomic_clear_32(addr, val); } /* * Set the given bits in the specified page. */ static inline void vm_page_aflag_set(vm_page_t m, uint8_t bits) { uint32_t *addr, val; VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits); /* * Access the whole 32-bit word containing the aflags field with an * atomic update. Parallel non-atomic updates to the other fields * within this word are handled properly by the atomic update. */ addr = (void *)&m->aflags; KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0, ("vm_page_aflag_set: aflags is misaligned")); val = bits; #if BYTE_ORDER == BIG_ENDIAN val <<= 24; #endif atomic_set_32(addr, val); } /* * vm_page_dirty: * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). */ static __inline void vm_page_dirty(vm_page_t m) { /* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */ #if defined(KLD_MODULE) || defined(INVARIANTS) vm_page_dirty_KBI(m); #else m->dirty = VM_PAGE_BITS_ALL; #endif } /* * vm_page_remque: * * If the given page is in a page queue, then remove it from that page * queue. * * The page must be locked. */ static inline void vm_page_remque(vm_page_t m) { if (m->queue != PQ_NONE) vm_page_dequeue(m); } /* * vm_page_undirty: * * Set page to not be dirty. Note: does not clear pmap modify bits */ static __inline void vm_page_undirty(vm_page_t m) { VM_PAGE_OBJECT_LOCK_ASSERT(m); m->dirty = 0; } static inline void vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_t mret; mret = vm_page_replace(mnew, object, pindex); KASSERT(mret == mold, ("invalid page replacement, mold=%p, mret=%p", mold, mret)); /* Unused if !INVARIANTS. */ (void)mold; (void)mret; +} + +static inline bool +vm_page_active(vm_page_t m) +{ + + return (m->queue == PQ_ACTIVE); +} + +static inline bool +vm_page_inactive(vm_page_t m) +{ + + return (m->queue == PQ_INACTIVE); +} + +static inline bool +vm_page_in_laundry(vm_page_t m) +{ + + return (m->queue == PQ_LAUNDRY); } #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ Index: stable/11/sys/vm/vm_pageout.c =================================================================== --- stable/11/sys/vm/vm_pageout.c (revision 318715) +++ stable/11/sys/vm/vm_pageout.c (revision 318716) @@ -1,1859 +1,2268 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); -static int vm_pageout_clean(vm_page_t m); +static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static bool vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif +/* Pagedaemon activity rates, in subdivisions of one second. */ +#define VM_LAUNDER_RATE 10 +#define VM_INACT_SCAN_RATE 2 int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_wakeup_thresh; static int vm_pageout_oom_seq = 12; bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ bool vm_pages_needed; /* Are threads waiting for free pages? */ +/* Pending request for dirty page laundering. */ +static enum { + VM_LAUNDRY_IDLE, + VM_LAUNDRY_BACKGROUND, + VM_LAUNDRY_SHORTFALL +} vm_laundry_request = VM_LAUNDRY_IDLE; + #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); #endif -static int vm_max_launder = 32; static int vm_pageout_update_period; -static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; #if defined(NO_SWAPPING) static int vm_swap_enabled = 0; static int vm_swap_idle_enabled = 0; #else static int vm_swap_enabled = 1; static int vm_swap_idle_enabled = 0; #endif static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "panic on out of memory instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); -SYSCTL_INT(_vm, OID_AUTO, max_launder, - CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); - SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW, &vm_pageout_update_period, 0, "Maximum active LRU update period"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0, "Low memory callback period"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif -SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, - CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); - SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RW, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); +static int act_scan_laundry_weight = 3; +SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RW, + &act_scan_laundry_weight, 0, + "weight given to clean vs. dirty pages in active queue scans"); + +static u_int vm_background_launder_target; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RW, + &vm_background_launder_target, 0, + "background laundering target, in pages"); + +static u_int vm_background_launder_rate = 4096; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RW, + &vm_background_launder_rate, 0, + "background laundering rate, in kilobytes per second"); + +static u_int vm_background_launder_max = 20 * 1024; +SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RW, + &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); + #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); +static u_int isqrt(u_int num); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); +static int vm_pageout_launder(struct vm_domain *vmd, int launder, + bool in_shortfall); +static void vm_pageout_laundry_worker(void *arg); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); /* * Initialize a dummy page for marking the caller's place in the specified * paging queue. In principle, this function only needs to set the flag * PG_MARKER. Nonetheless, it write busies and initializes the hold count * to one as safety precautions. */ static void vm_pageout_init_marker(vm_page_t marker, u_short queue) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; marker->queue = queue; marker->hold_count = 1; } /* * vm_pageout_fallback_object_lock: * * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is * known to have failed and page queue must be either PQ_ACTIVE or * PQ_INACTIVE. To avoid lock order violation, unlock the page queue * while locking the vm object. Use marker page to detect page queue * changes and maintain notion of next page on page queue. Return * TRUE if no changes were detected, FALSE otherwise. vm object is * locked on return. * * This function depends on both the lock portion of struct vm_object * and normal struct vm_page being type stable. */ static boolean_t vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_object_t object; queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_unlock(m); VM_OBJECT_WLOCK(object); vm_page_lock(m); vm_pagequeue_lock(pq); /* * The page's object might have changed, and/or the page might * have moved from its original position in the queue. If the * page's object has changed, then the caller should abandon * processing the page because the wrong object lock was * acquired. Use the marker's plinks.q, not the page's, to * determine if the page has been moved. The state of the * page's plinks.q can be indeterminate; whereas, the marker's * plinks.q must be valid. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m->object == object && m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Lock the page while holding the page queue lock. Use marker page * to detect page queue changes and maintain notion of next page on * page queue. Return TRUE if no changes were detected, FALSE * otherwise. The page is locked on return. The page queue lock might * be dropped and reacquired. * * This function depends on normal struct vm_page being type stable. */ static boolean_t vm_pageout_page_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_page_lock_assert(m, MA_NOTOWNED); if (vm_page_trylock(m)) return (TRUE); queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_lock(m); vm_pagequeue_lock(pq); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Scan for pages at adjacent offsets within the given page's object that are * eligible for laundering, form a cluster of these pages and the given page, * and launder that cluster. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; vm_pindex_t pindex; int ib, is, page_base, pageout_count; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); pindex = m->pindex; /* * We can't clean the page if it is busy or held. */ vm_page_assert_unbusied(m); KASSERT(m->hold_count == 0, ("page %p is held", m)); vm_page_unlock(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * We can cluster only if the page is not clean, busy, or held, and - * the page is inactive. + * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying to * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { ib = 0; break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; break; } vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; break; } vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * We are at an alignment boundary. Stop here, and switch * directions. Do not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_test_dirty(p); if (p->dirty == 0) break; vm_page_lock(p); - if (p->queue != PQ_INACTIVE || + if (!vm_page_in_laundry(p) || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; } vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past an alignment boundary. This catches * boundary conditions. */ if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); vm_page_sbusy(mc[i]); pmap_remove_write(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); + /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* - * Page outside of range of object. Right now we - * essentially lose the changes by pretending it - * worked. + * The page is outside the object's range. We pretend + * that the page out worked and clean the page, so the + * changes will be lost if the page is reclaimed by + * the page daemon. */ vm_page_undirty(mt); + vm_page_lock(mt); + if (vm_page_in_laundry(mt)) + vm_page_deactivate_noreuse(mt); + vm_page_unlock(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* - * If page couldn't be paged out, then reactivate the - * page so it doesn't clog the inactive list. (We - * will try paging out it again later). + * If the page couldn't be paged out, then reactivate + * it so that it doesn't clog the laundry and inactive + * queues. (We will try paging it out again later). */ vm_page_lock(mt); vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. */ static void vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, long desired) { vm_object_t backing_object, object; vm_page_t p; int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_UNMANAGED) != 0 || object->paging_in_progress != 0) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * Scan the object's entire memory queue. */ TAILQ_FOREACH(p, &object->memq, listq) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (vm_page_busied(p)) continue; PCPU_INC(cnt.v_pdpages); vm_page_lock(p); if (p->wire_count != 0 || p->hold_count != 0 || !pmap_page_exists_quick(pmap, p)) { vm_page_unlock(p); continue; } act_delta = pmap_ts_referenced(p); if ((p->aflags & PGA_REFERENCED) != 0) { if (act_delta == 0) act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } - if (p->queue != PQ_ACTIVE && act_delta != 0) { + if (!vm_page_active(p) && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; - } else if (p->queue == PQ_ACTIVE) { + } else if (vm_page_active(p)) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && p->act_count == 0) { pmap_remove_all(p); vm_page_deactivate(p); } else vm_page_requeue(p); } else { vm_page_activate(p); if (p->act_count < ACT_MAX - ACT_ADVANCE) p->act_count += ACT_ADVANCE; vm_page_requeue(p); } - } else if (p->queue == PQ_INACTIVE) + } else if (vm_page_inactive(p)) pmap_remove_all(p); vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_RLOCK(backing_object); if (object != first_object) VM_OBJECT_RUNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_RUNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int -vm_pageout_clean(vm_page_t m) +vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); vm_page_lock(m); /* * While the object and page were unlocked, the page * may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ - if (m->queue != PQ_INACTIVE || m->object != object || + if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; goto unlock_all; } /* * The page may have been busied or held while the object * and page locks were released. */ if (vm_page_busied(m) || m->hold_count != 0) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ - if (vm_pageout_cluster(m) == 0) + if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: vm_page_lock_assert(m, MA_NOTOWNED); if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* + * Attempt to launder the specified number of pages. + * + * Returns the number of pages successfully laundered. + */ +static int +vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) +{ + struct vm_pagequeue *pq; + vm_object_t object; + vm_page_t m, next; + int act_delta, error, maxscan, numpagedout, starting_target; + int vnodes_skipped; + bool pageout_ok, queue_locked; + + starting_target = launder; + vnodes_skipped = 0; + + /* + * Scan the laundry queue for pages eligible to be laundered. We stop + * once the target number of dirty pages have been laundered, or once + * we've reached the end of the queue. A single iteration of this loop + * may cause more than one page to be laundered because of clustering. + * + * maxscan ensures that we don't re-examine requeued pages. Any + * additional pages written as part of a cluster are subtracted from + * maxscan since they must be taken from the laundry queue. + */ + pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; + maxscan = pq->pq_cnt; + + vm_pagequeue_lock(pq); + queue_locked = true; + for (m = TAILQ_FIRST(&pq->pq_pl); + m != NULL && maxscan-- > 0 && launder > 0; + m = next) { + vm_pagequeue_assert_locked(pq); + KASSERT(queue_locked, ("unlocked laundry queue")); + KASSERT(vm_page_in_laundry(m), + ("page %p has an inconsistent queue", m)); + next = TAILQ_NEXT(m, plinks.q); + if ((m->flags & PG_MARKER) != 0) + continue; + KASSERT((m->flags & PG_FICTITIOUS) == 0, + ("PG_FICTITIOUS page %p cannot be in laundry queue", m)); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("VPO_UNMANAGED page %p cannot be in laundry queue", m)); + if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { + vm_page_unlock(m); + continue; + } + object = m->object; + if ((!VM_OBJECT_TRYWLOCK(object) && + (!vm_pageout_fallback_object_lock(m, &next) || + m->hold_count != 0)) || vm_page_busied(m)) { + VM_OBJECT_WUNLOCK(object); + vm_page_unlock(m); + continue; + } + + /* + * Unlock the laundry queue, invalidating the 'next' pointer. + * Use a marker to remember our place in the laundry queue. + */ + TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker, + plinks.q); + vm_pagequeue_unlock(pq); + queue_locked = false; + + /* + * Invalid pages can be easily freed. They cannot be + * mapped; vm_page_free() asserts this. + */ + if (m->valid == 0) + goto free_page; + + /* + * If the page has been referenced and the object is not dead, + * reactivate or requeue the page depending on whether the + * object is mapped. + */ + if ((m->aflags & PGA_REFERENCED) != 0) { + vm_page_aflag_clear(m, PGA_REFERENCED); + act_delta = 1; + } else + act_delta = 0; + if (object->ref_count != 0) + act_delta += pmap_ts_referenced(m); + else { + KASSERT(!pmap_page_is_mapped(m), + ("page %p is mapped", m)); + } + if (act_delta != 0) { + if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); + vm_page_activate(m); + + /* + * Increase the activation count if the page + * was referenced while in the laundry queue. + * This makes it less likely that the page will + * be returned prematurely to the inactive + * queue. + */ + m->act_count += act_delta + ACT_ADVANCE; + + /* + * If this was a background laundering, count + * activated pages towards our target. The + * purpose of background laundering is to ensure + * that pages are eventually cycled through the + * laundry queue, and an activation is a valid + * way out. + */ + if (!in_shortfall) + launder--; + goto drop_page; + } else if ((object->flags & OBJ_DEAD) == 0) + goto requeue_page; + } + + /* + * If the page appears to be clean at the machine-independent + * layer, then remove all of its mappings from the pmap in + * anticipation of freeing it. If, however, any of the page's + * mappings allow write access, then the page may still be + * modified until the last of those mappings are removed. + */ + if (object->ref_count != 0) { + vm_page_test_dirty(m); + if (m->dirty == 0) + pmap_remove_all(m); + } + + /* + * Clean pages are freed, and dirty pages are paged out unless + * they belong to a dead object. Requeueing dirty pages from + * dead objects is pointless, as they are being paged out and + * freed by the thread that destroyed the object. + */ + if (m->dirty == 0) { +free_page: + vm_page_free(m); + PCPU_INC(cnt.v_dfree); + } else if ((object->flags & OBJ_DEAD) == 0) { + if (object->type != OBJT_SWAP && + object->type != OBJT_DEFAULT) + pageout_ok = true; + else if (disable_swap_pageouts) + pageout_ok = false; + else + pageout_ok = true; + if (!pageout_ok) { +requeue_page: + vm_pagequeue_lock(pq); + queue_locked = true; + vm_page_requeue_locked(m); + goto drop_page; + } + + /* + * Form a cluster with adjacent, dirty pages from the + * same object, and page out that entire cluster. + * + * The adjacent, dirty pages must also be in the + * laundry. However, their mappings are not checked + * for new references. Consequently, a recently + * referenced page may be paged out. However, that + * page will not be prematurely reclaimed. After page + * out, the page will be placed in the inactive queue, + * where any new references will be detected and the + * page reactivated. + */ + error = vm_pageout_clean(m, &numpagedout); + if (error == 0) { + launder -= numpagedout; + maxscan -= numpagedout - 1; + } else if (error == EDEADLK) { + pageout_lock_miss++; + vnodes_skipped++; + } + goto relock_queue; + } +drop_page: + vm_page_unlock(m); + VM_OBJECT_WUNLOCK(object); +relock_queue: + if (!queue_locked) { + vm_pagequeue_lock(pq); + queue_locked = true; + } + next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q); + TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q); + } + vm_pagequeue_unlock(pq); + + /* + * Wakeup the sync daemon if we skipped a vnode in a writeable object + * and we didn't launder enough pages. + */ + if (vnodes_skipped > 0 && launder > 0) + (void)speedup_syncer(); + + return (starting_target - launder); +} + +/* + * Compute the integer square root. + */ +static u_int +isqrt(u_int num) +{ + u_int bit, root, tmp; + + bit = 1u << ((NBBY * sizeof(u_int)) - 2); + while (bit > num) + bit >>= 2; + root = 0; + while (bit != 0) { + tmp = root + bit; + root >>= 1; + if (num >= tmp) { + num -= tmp; + root += bit; + } + bit >>= 2; + } + return (root); +} + +/* + * Perform the work of the laundry thread: periodically wake up and determine + * whether any pages need to be laundered. If so, determine the number of pages + * that need to be laundered, and launder them. + */ +static void +vm_pageout_laundry_worker(void *arg) +{ + struct vm_domain *domain; + struct vm_pagequeue *pq; + uint64_t nclean, ndirty; + u_int last_launder, wakeups; + int domidx, last_target, launder, shortfall, shortfall_cycle, target; + bool in_shortfall; + + domidx = (uintptr_t)arg; + domain = &vm_dom[domidx]; + pq = &domain->vmd_pagequeues[PQ_LAUNDRY]; + KASSERT(domain->vmd_segs != 0, ("domain without segments")); + vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY); + + shortfall = 0; + in_shortfall = false; + shortfall_cycle = 0; + target = 0; + last_launder = 0; + + /* + * The pageout laundry worker is never done, so loop forever. + */ + for (;;) { + KASSERT(target >= 0, ("negative target %d", target)); + KASSERT(shortfall_cycle >= 0, + ("negative cycle %d", shortfall_cycle)); + launder = 0; + wakeups = VM_METER_PCPU_CNT(v_pdwakeups); + + /* + * First determine whether we need to launder pages to meet a + * shortage of free pages. + */ + if (shortfall > 0) { + in_shortfall = true; + shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; + target = shortfall; + } else if (!in_shortfall) + goto trybackground; + else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) { + /* + * We recently entered shortfall and began laundering + * pages. If we have completed that laundering run + * (and we are no longer in shortfall) or we have met + * our laundry target through other activity, then we + * can stop laundering pages. + */ + in_shortfall = false; + target = 0; + goto trybackground; + } + last_launder = wakeups; + launder = target / shortfall_cycle--; + goto dolaundry; + + /* + * There's no immediate need to launder any pages; see if we + * meet the conditions to perform background laundering: + * + * 1. The ratio of dirty to clean inactive pages exceeds the + * background laundering threshold and the pagedaemon has + * been woken up to reclaim pages since our last + * laundering, or + * 2. we haven't yet reached the target of the current + * background laundering run. + * + * The background laundering threshold is not a constant. + * Instead, it is a slowly growing function of the number of + * page daemon wakeups since the last laundering. Thus, as the + * ratio of dirty to clean inactive pages grows, the amount of + * memory pressure required to trigger laundering decreases. + */ +trybackground: + nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count; + ndirty = vm_cnt.v_laundry_count; + if (target == 0 && wakeups != last_launder && + ndirty * isqrt(wakeups - last_launder) >= nclean) { + target = vm_background_launder_target; + } + + /* + * We have a non-zero background laundering target. If we've + * laundered up to our maximum without observing a page daemon + * wakeup, just stop. This is a safety belt that ensures we + * don't launder an excessive amount if memory pressure is low + * and the ratio of dirty to clean pages is large. Otherwise, + * proceed at the background laundering rate. + */ + if (target > 0) { + if (wakeups != last_launder) { + last_launder = wakeups; + last_target = target; + } else if (last_target - target >= + vm_background_launder_max * PAGE_SIZE / 1024) { + target = 0; + } + launder = vm_background_launder_rate * PAGE_SIZE / 1024; + launder /= VM_LAUNDER_RATE; + if (launder > target) + launder = target; + } + +dolaundry: + if (launder > 0) { + /* + * Because of I/O clustering, the number of laundered + * pages could exceed "target" by the maximum size of + * a cluster minus one. + */ + target -= min(vm_pageout_launder(domain, launder, + in_shortfall), target); + pause("laundp", hz / VM_LAUNDER_RATE); + } + + /* + * If we're not currently laundering pages and the page daemon + * hasn't posted a new request, sleep until the page daemon + * kicks us. + */ + vm_pagequeue_lock(pq); + if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE) + (void)mtx_sleep(&vm_laundry_request, + vm_pagequeue_lockptr(pq), PVM, "launds", 0); + + /* + * If the pagedaemon has indicated that it's in shortfall, start + * a shortfall laundering unless we're already in the middle of + * one. This may preempt a background laundering. + */ + if (vm_laundry_request == VM_LAUNDRY_SHORTFALL && + (!in_shortfall || shortfall_cycle == 0)) { + shortfall = vm_laundry_target() + vm_pageout_deficit; + target = 0; + } else + shortfall = 0; + + if (target == 0) + vm_laundry_request = VM_LAUNDRY_IDLE; + vm_pagequeue_unlock(pq); + } +} + +/* * vm_pageout_scan does the dirty work for the pageout daemon. * - * pass 0 - Update active LRU/deactivate pages - * pass 1 - Free inactive pages - * pass 2 - Launder dirty pages + * pass == 0: Update active LRU/deactivate pages + * pass >= 1: Free inactive pages * * Returns true if pass was zero or enough pages were freed by the inactive * queue scan to meet the target. */ static bool vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; long min_scan; - int act_delta, addl_page_shortage, deficit, error, inactq_shortage; - int maxlaunder, maxscan, page_shortage, scan_tick, scanned; - int starting_page_shortage, vnodes_skipped; - boolean_t pageout_ok, queue_locked; + int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan; + int page_shortage, scan_tick, scanned, starting_page_shortage; + boolean_t queue_locked; /* * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ if (vmd == &vm_dom[0] && pass > 0 && (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(); lowmem_uptime = time_uptime; } /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Calculate the number of pages that we want to free. This number * can be negative if many pages are freed between the wakeup call to * the page daemon and this calculation. */ if (pass > 0) { deficit = atomic_readandclear_int(&vm_pageout_deficit); page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; starting_page_shortage = page_shortage; /* - * maxlaunder limits the number of dirty pages we flush per scan. - * For most systems a smaller value (16 or 32) is more robust under - * extreme memory and disk pressure because any unnecessary writes - * to disk can result in extreme performance degredation. However, - * systems with excessive dirty pages (especially when MAP_NOSYNC is - * used) will die horribly with limited laundering. If the pageout - * daemon cannot clean enough pages in the first pass, we let it go - * all out in succeeding passes. - */ - if ((maxlaunder = vm_max_launder) <= 1) - maxlaunder = 1; - if (pass > 1) - maxlaunder = 10000; - - vnodes_skipped = 0; - - /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make * decisions for the inactive queue, only for the active queue.) */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queue_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked inactive queue")); - KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); + KASSERT(vm_page_inactive(m), ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in inactive queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in inactive queue", m)); /* * The page or object lock acquisitions fail if the * page was removed from the queue or moved to a * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ if (!vm_pageout_page_lock(m, &next)) goto unlock_page; else if (m->hold_count != 0) { /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted * from the inactive count. See the * calculation of inactq_shortage before the * loop over the active queue below. */ addl_page_shortage++; goto unlock_page; } object = m->object; if (!VM_OBJECT_TRYWLOCK(object)) { if (!vm_pageout_fallback_object_lock(m, &next)) goto unlock_object; else if (m->hold_count != 0) { addl_page_shortage++; goto unlock_object; } } if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; unlock_object: VM_OBJECT_WUNLOCK(object); unlock_page: vm_page_unlock(m); continue; } KASSERT(m->hold_count == 0, ("Held page %p", m)); /* - * We unlock the inactive page queue, invalidating the - * 'next' pointer. Use our marker to remember our - * place. + * Dequeue the inactive page and unlock the inactive page + * queue, invalidating the 'next' pointer. Dequeueing the + * page here avoids a later reacquisition (and release) of + * the inactive page queue lock when vm_page_activate(), + * vm_page_free(), or vm_page_launder() is called. Use a + * marker to remember our place in the inactive queue. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); + vm_page_dequeue_locked(m); vm_pagequeue_unlock(pq); queue_locked = FALSE; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { + PCPU_INC(cnt.v_reactivated); vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the inactive queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. */ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; - } else if ((object->flags & OBJ_DEAD) == 0) - goto requeue_page; + } else if ((object->flags & OBJ_DEAD) == 0) { + vm_pagequeue_lock(pq); + queue_locked = TRUE; + m->queue = PQ_INACTIVE; + TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); + vm_pagequeue_cnt_inc(pq); + goto drop_page; + } } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } + /* + * Clean pages can be freed, but dirty pages must be sent back + * to the laundry, unless they belong to a dead object. + * Requeueing dirty pages from dead objects is pointless, as + * they are being paged out and freed by the thread that + * destroyed the object. + */ if (m->dirty == 0) { - /* - * Clean pages can be freed. - */ free_page: vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; - } else if ((object->flags & OBJ_DEAD) != 0) { - /* - * Leave dirty pages from dead objects at the front of - * the queue. They are being paged out and freed by - * the thread that destroyed the object. They will - * leave the queue shortly after the scan finishes, so - * they should be discounted from the inactive count. - */ - addl_page_shortage++; - } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { - /* - * Dirty pages need to be paged out, but flushing - * a page is extremely expensive versus freeing - * a clean page. Rather then artificially limiting - * the number of pages we can flush, we instead give - * dirty pages extra priority on the inactive queue - * by forcing them to be cycled through the queue - * twice before being flushed, after which the - * (now clean) page will cycle through once more - * before being freed. This significantly extends - * the thrash point for a heavily loaded machine. - */ - m->flags |= PG_WINATCFLS; -requeue_page: - vm_pagequeue_lock(pq); - queue_locked = TRUE; - vm_page_requeue_locked(m); - } else if (maxlaunder > 0) { - /* - * We always want to try to flush some dirty pages if - * we encounter them, to keep the system stable. - * Normally this number is small, but under extreme - * pressure where there are insufficient clean pages - * on the inactive queue, we may have to go all out. - */ - - if (object->type != OBJT_SWAP && - object->type != OBJT_DEFAULT) - pageout_ok = TRUE; - else if (disable_swap_pageouts) - pageout_ok = FALSE; - else if (defer_swap_pageouts) - pageout_ok = vm_page_count_min(); - else - pageout_ok = TRUE; - if (!pageout_ok) - goto requeue_page; - error = vm_pageout_clean(m); - /* - * Decrement page_shortage on success to account for - * the (future) cleaned page. Otherwise we could wind - * up laundering or cleaning too many pages. - */ - if (error == 0) { - page_shortage--; - maxlaunder--; - } else if (error == EDEADLK) { - pageout_lock_miss++; - vnodes_skipped++; - } else if (error == EBUSY) { - addl_page_shortage++; - } - vm_page_lock_assert(m, MA_NOTOWNED); - goto relock_queue; - } + } else if ((object->flags & OBJ_DEAD) == 0) + vm_page_launder(m); drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); -relock_queue: if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = TRUE; } next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } vm_pagequeue_unlock(pq); + /* + * Wake up the laundry thread so that it can perform any needed + * laundering. If we didn't meet our target, we're in shortfall and + * need to launder more aggressively. + */ + if (vm_laundry_request == VM_LAUNDRY_IDLE && + starting_page_shortage > 0) { + pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY]; + vm_pagequeue_lock(pq); + if (page_shortage > 0) { + vm_laundry_request = VM_LAUNDRY_SHORTFALL; + PCPU_INC(cnt.v_pdshortfalls); + } else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL) + vm_laundry_request = VM_LAUNDRY_BACKGROUND; + wakeup(&vm_laundry_request); + vm_pagequeue_unlock(pq); + } + #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't free the targeted number of * pages. */ if (vm_swap_enabled && page_shortage > 0) vm_req_vmdaemon(VM_SWAP_NORMAL); #endif /* - * Wakeup the sync daemon if we skipped a vnode in a writeable object - * and we didn't free enough pages. - */ - if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target - - vm_cnt.v_free_min) - (void)speedup_syncer(); - - /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Compute the number of pages we want to try to move from the - * active queue to the inactive queue. + * active queue to either the inactive or laundry queue. + * + * When scanning active pages, we make clean pages count more heavily + * towards the page shortage than dirty pages. This is because dirty + * pages must be laundered before they can be reused and thus have less + * utility when attempting to quickly alleviate a shortage. However, + * this weighting also causes the scan to deactivate dirty pages more + * more aggressively, improving the effectiveness of clustering and + * ensuring that they can eventually be reused. */ - inactq_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count + + inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count + + vm_cnt.v_laundry_count / act_scan_laundry_weight) + vm_paging_target() + deficit + addl_page_shortage; + page_shortage *= act_scan_laundry_weight; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. */ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, scanned++) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in active queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in active queue", m)); if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); continue; } /* * The count for page daemon pages is updated after checking * the page for eligibility. */ PCPU_INC(cnt.v_pdpages); /* * Check to see "how much" the page has been used. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; /* * Perform an unsynchronized object ref count check. While * the page lock ensures that the page is not reallocated to * another object, in particular, one with unmanaged mappings * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: * 1) The count was transitioning to zero, but we saw a non- * zero value. pmap_ts_referenced() will return zero * because the page is not mapped. * 2) The count was transitioning to one, but we saw zero. * This race delays the detection of a new reference. At * worst, we will deactivate and reactivate the page. */ if (m->object->ref_count != 0) act_delta += pmap_ts_referenced(m); /* * Advance or decay the act_count based on recent usage. */ if (act_delta != 0) { m->act_count += ACT_ADVANCE + act_delta; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } else m->act_count -= min(m->act_count, ACT_DECLINE); /* - * Move this page to the tail of the active or inactive + * Move this page to the tail of the active, inactive or laundry * queue depending on usage. */ if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); - vm_page_deactivate(m); - inactq_shortage--; + + /* + * When not short for inactive pages, let dirty pages go + * through the inactive queue before moving to the + * laundry queues. This gives them some extra time to + * be reactivated, potentially avoiding an expensive + * pageout. During a page shortage, the inactive queue + * is necessarily small, so we may move dirty pages + * directly to the laundry queue. + */ + if (inactq_shortage <= 0) + vm_page_deactivate(m); + else { + /* + * Calling vm_page_test_dirty() here would + * require acquisition of the object's write + * lock. However, during a page shortage, + * directing dirty pages into the laundry + * queue is only an optimization and not a + * requirement. Therefore, we simply rely on + * the opportunistic updates to the page's + * dirty field by the pmap. + */ + if (m->dirty == 0) { + vm_page_deactivate(m); + inactq_shortage -= + act_scan_laundry_weight; + } else { + vm_page_launder(m); + inactq_shortage--; + } + } } else vm_page_requeue_locked(m); vm_page_unlock(m); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second. */ if (vm_swap_idle_enabled) { static long lsec; if (time_second != lsec) { vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } } #endif return (page_shortage <= 0); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. */ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. */ static long vm_pageout_oom_pagecount(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; long res; map = &vmspace->vm_map; KASSERT(!map->system_map, ("system map")); sx_assert(&map->lock, SA_LOCKED); res = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; obj = entry->object.vm_object; if (obj == NULL) continue; if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && obj->ref_count != 1) continue; switch (obj->type) { case OBJT_DEFAULT: case OBJT_SWAP: case OBJT_PHYS: case OBJT_VNODE: res += obj->resident_page_count; break; } } return (res); } void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of it's child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { int breakout; PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td) && !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD_LITE(p); PROC_UNLOCK(p); sx_sunlock(&allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); continue; } size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); sx_slock(&allproc_lock); /* * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { if (vm_panic_on_oom != 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); wakeup(&vm_cnt.v_free_count); } } static void vm_pageout_worker(void *arg) { struct vm_domain *domain; int domidx, pass; bool target_met; domidx = (uintptr_t)arg; domain = &vm_dom[domidx]; pass = 0; target_met = true; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(domain->vmd_segs != 0, ("domain without segments")); domain->vmd_last_active_scan = ticks; vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, &domain->vmd_inacthead, plinks.q); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { mtx_lock(&vm_page_queue_free_mtx); /* * Generally, after a level >= 1 scan, if there are enough * free pages to wakeup the waiters, then they are already * awake. A call to vm_page_free() during the scan awakened * them. However, in the following case, this wakeup serves * to bound the amount of time that a thread might wait. * Suppose a thread's call to vm_page_alloc() fails, but * before that thread calls VM_WAIT, enough pages are freed by * other threads to alleviate the free page shortage. The * thread will, nonetheless, wait until another page is freed * or this wakeup is performed. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = false; wakeup(&vm_cnt.v_free_count); } /* * Do not clear vm_pageout_wanted until we reach our free page * target. Otherwise, we may be awakened over and over again, * wasting CPU time. */ if (vm_pageout_wanted && target_met) vm_pageout_wanted = false; /* * Might the page daemon receive a wakeup call? */ if (vm_pageout_wanted) { /* * No. Either vm_pageout_wanted was set by another * thread during the previous scan, which must have * been a level 0 scan, or vm_pageout_wanted was * already set and the scan failed to free enough - * pages. If we haven't yet performed a level >= 2 - * scan (unlimited dirty cleaning), then upgrade the - * level and scan again now. Otherwise, sleep a bit - * and try again later. + * pages. If we haven't yet performed a level >= 1 + * (page reclamation) scan, then increase the level + * and scan again now. Otherwise, sleep a bit and + * try again later. */ mtx_unlock(&vm_page_queue_free_mtx); - if (pass > 1) - pause("psleep", hz / 2); + if (pass >= 1) + pause("psleep", hz / VM_INACT_SCAN_RATE); pass++; } else { /* * Yes. Sleep until pages need to be reclaimed or * have their reference stats updated. */ if (mtx_sleep(&vm_pageout_wanted, &vm_page_queue_free_mtx, PDROP | PVM, "psleep", hz) == 0) { PCPU_INC(cnt.v_pdwakeups); pass = 1; } else pass = 0; } target_met = vm_pageout_scan(domain, pass); } } /* * vm_pageout_init initialises basic pageout daemon settings. */ static void vm_pageout_init(void) { /* * Initialize some paging parameters. */ vm_cnt.v_interrupt_free_min = 2; if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (vm_cnt.v_page_count > 1024) vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; else vm_cnt.v_free_min = 4; vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + vm_cnt.v_interrupt_free_min; vm_cnt.v_free_reserved = vm_pageout_page_count + vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; vm_cnt.v_free_min += vm_cnt.v_free_reserved; vm_cnt.v_free_severe += vm_cnt.v_free_reserved; vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. */ vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = vm_cnt.v_free_count / 3; + + /* + * Target amount of memory to move out of the laundry queue during a + * background laundering. This is proportional to the amount of system + * memory. + */ + vm_background_launder_target = (vm_cnt.v_free_target - + vm_cnt.v_free_min) / 10; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { int error; #ifdef VM_NUMA_ALLOC int i; #endif swap_pager_swap_init(); + error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL, + 0, 0, "laundry: dom0"); + if (error != 0) + panic("starting laundry for domain 0, error %d", error); #ifdef VM_NUMA_ALLOC for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { panic("starting pageout for domain %d, error %d\n", i, error); } } #endif error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); vm_pageout_worker((void *)(uintptr_t)0); } /* * Unless the free page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &vm_cnt.v_free_count following this function unless * the free page queue lock is held until the msleep() is performed. */ void pagedaemon_wakeup(void) { if (!vm_pageout_wanted && curthread->td_proc != pageproc) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon(int req) { static int lastrun = 0; mtx_lock(&vm_daemon_mtx); vm_pageout_req_swapout |= req; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } mtx_unlock(&vm_daemon_mtx); } static void vm_daemon(void) { struct rlimit rsslim; struct proc *p; struct thread *td; struct vmspace *vm; int breakout, swapout_flags, tryagain, attempts; #ifdef RACCT uint64_t rsize, ravailable; #endif while (TRUE) { mtx_lock(&vm_daemon_mtx); msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT racct_enable ? hz : 0 #else 0 #endif ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); if (swapout_flags) swapout_procs(swapout_flags); /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ tryagain = 0; attempts = 0; again: attempts++; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); _PHOLD_LITE(p); PROC_UNLOCK(p); if (vm == NULL) { PRELE(p); continue; } sx_sunlock(&allproc_lock); size = vmspace_resident_count(vm); if (size >= limit) { vm_pageout_map_deactivate_pages( &vm->vm_map, limit); size = vmspace_resident_count(vm); } #ifdef RACCT if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); if (rsize > ravailable) { /* * Don't be overly aggressive; this * might be an innocent process, * and the limit could've been exceeded * by some memory hog. Don't try * to deactivate more than 1/4th * of process' resident set size. */ if (attempts <= 8) { if (ravailable < rsize - (rsize / 4)) { ravailable = rsize - (rsize / 4); } } vm_pageout_map_deactivate_pages( &vm->vm_map, OFF_TO_IDX(ravailable)); /* Update RSS usage after paging out. */ size = vmspace_resident_count(vm); rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); PROC_UNLOCK(p); if (rsize > ravailable) tryagain = 1; } } #endif vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) goto again; } } #endif /* !defined(NO_SWAPPING) */ Index: stable/11/sys/vm/vm_phys.c =================================================================== --- stable/11/sys/vm/vm_phys.c (revision 318715) +++ stable/11/sys/vm/vm_phys.c (revision 318716) @@ -1,1530 +1,1530 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Physical memory system implementation * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX, "Too many physsegs."); #ifdef VM_NUMA_ALLOC struct mem_affinity *mem_affinity; int *mem_locality; #endif int vm_ndomains = 1; struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX]; int vm_phys_nsegs; struct vm_phys_fictitious_seg; static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *, struct vm_phys_fictitious_seg *); RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree = RB_INITIALIZER(_vm_phys_fictitious_tree); struct vm_phys_fictitious_seg { RB_ENTRY(vm_phys_fictitious_seg) node; /* Memory region data */ vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; }; RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node, vm_phys_fictitious_cmp); static struct rwlock vm_phys_fictitious_reg_lock; MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages"); static struct vm_freelist vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER]; static int vm_nfreelists; /* * Provides the mapping from VM_FREELIST_* to free list indices (flind). */ static int vm_freelist_to_flind[VM_NFREELIST]; CTASSERT(VM_FREELIST_DEFAULT == 0); #ifdef VM_FREELIST_ISADMA #define VM_ISADMA_BOUNDARY 16777216 #endif #ifdef VM_FREELIST_DMA32 #define VM_DMA32_BOUNDARY ((vm_paddr_t)1 << 32) #endif /* * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about * the ordering of the free list boundaries. */ #if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY) CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY); #endif #if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY) CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY); #endif static int cnt_prezero; SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, "The number of physical pages prezeroed at idle time"); static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_free, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_free, "A", "Phys Free Info"); static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info"); #ifdef VM_NUMA_ALLOC static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info"); #endif SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD, &vm_ndomains, 0, "Number of physical memory domains available."); /* * Default to first-touch + round-robin. */ static struct mtx vm_default_policy_mtx; MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex", MTX_DEF); #ifdef VM_NUMA_ALLOC static struct vm_domain_policy vm_default_policy = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); #else /* Use round-robin so the domain policy code will only try once per allocation */ static struct vm_domain_policy vm_default_policy = VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0); #endif static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order); static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain); static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end); static int vm_phys_paddr_to_segind(vm_paddr_t pa); static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order); static int sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS) { char policy_name[32]; int error; mtx_lock(&vm_default_policy_mtx); /* Map policy to output string */ switch (vm_default_policy.p.policy) { case VM_POLICY_FIRST_TOUCH: strcpy(policy_name, "first-touch"); break; case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN: strcpy(policy_name, "first-touch-rr"); break; case VM_POLICY_ROUND_ROBIN: default: strcpy(policy_name, "rr"); break; } mtx_unlock(&vm_default_policy_mtx); error = sysctl_handle_string(oidp, &policy_name[0], sizeof(policy_name), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&vm_default_policy_mtx); /* Set: match on the subset of policies that make sense as a default */ if (strcmp("first-touch-rr", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0); } else if (strcmp("first-touch", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_FIRST_TOUCH, 0); } else if (strcmp("rr", policy_name) == 0) { vm_domain_policy_set(&vm_default_policy, VM_POLICY_ROUND_ROBIN, 0); } else { error = EINVAL; goto finish; } error = 0; finish: mtx_unlock(&vm_default_policy_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_vm_default_policy, "A", "Default policy (rr, first-touch, first-touch-rr"); /* * Red-black tree helpers for vm fictitious range management. */ static inline int vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p, struct vm_phys_fictitious_seg *range) { KASSERT(range->start != 0 && range->end != 0, ("Invalid range passed on search for vm_fictitious page")); if (p->start >= range->end) return (1); if (p->start < range->start) return (-1); return (0); } static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1, struct vm_phys_fictitious_seg *p2) { /* Check if this is a search for a page */ if (p1->end == 0) return (vm_phys_fictitious_in_range(p1, p2)); KASSERT(p2->end != 0, ("Invalid range passed as second parameter to vm fictitious comparison")); /* Searching to add a new range */ if (p1->end <= p2->start) return (-1); if (p1->start >= p2->end) return (1); panic("Trying to add overlapping vm fictitious ranges:\n" "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start, (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end); } static __inline int vm_rr_selectdomain(void) { #ifdef VM_NUMA_ALLOC struct thread *td; td = curthread; td->td_dom_rr_idx++; td->td_dom_rr_idx %= vm_ndomains; return (td->td_dom_rr_idx); #else return (0); #endif } /* * Initialise a VM domain iterator. * * Check the thread policy, then the proc policy, * then default to the system policy. * * Later on the various layers will have this logic * plumbed into them and the phys code will be explicitly * handed a VM domain policy to use. */ static void vm_policy_iterator_init(struct vm_domain_iterator *vi) { #ifdef VM_NUMA_ALLOC struct vm_domain_policy lcl; #endif vm_domain_iterator_init(vi); #ifdef VM_NUMA_ALLOC /* Copy out the thread policy */ vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Thread policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } vm_domain_policy_localcopy(&lcl, &curthread->td_proc->p_vm_dom_policy); if (lcl.p.policy != VM_POLICY_NONE) { /* Process policy is present; use it */ vm_domain_iterator_set_policy(vi, &lcl); return; } #endif /* Use system default policy */ vm_domain_iterator_set_policy(vi, &vm_default_policy); } static void vm_policy_iterator_finish(struct vm_domain_iterator *vi) { vm_domain_iterator_cleanup(vi); } boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high) { struct vm_phys_seg *s; int idx; while ((idx = ffsl(mask)) != 0) { idx--; /* ffsl counts from 1 */ mask &= ~(1UL << idx); s = &vm_phys_segs[idx]; if (low < s->end && high > s->start) return (TRUE); } return (FALSE); } /* * Outputs the state of the physical memory allocator, specifically, * the amount of physical memory in each free list. */ static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_freelist *fl; int dom, error, flind, oind, pind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); for (dom = 0; dom < vm_ndomains; dom++) { sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { sbuf_printf(&sbuf, "\nFREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, " | POOL %d", pind); sbuf_printf(&sbuf, "\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) sbuf_printf(&sbuf, "-- -- "); sbuf_printf(&sbuf, "--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { sbuf_printf(&sbuf, " %2d (%6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; sbuf_printf(&sbuf, " | %6d", fl[oind].lcnt); } sbuf_printf(&sbuf, "\n"); } } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Outputs the set of physical memory segments. */ static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct vm_phys_seg *seg; int error, segind; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); for (segind = 0; segind < vm_phys_nsegs; segind++) { sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind); seg = &vm_phys_segs[segind]; sbuf_printf(&sbuf, "start: %#jx\n", (uintmax_t)seg->start); sbuf_printf(&sbuf, "end: %#jx\n", (uintmax_t)seg->end); sbuf_printf(&sbuf, "domain: %d\n", seg->domain); sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Return affinity, or -1 if there's no affinity information. */ int vm_phys_mem_affinity(int f, int t) { #ifdef VM_NUMA_ALLOC if (mem_locality == NULL) return (-1); if (f >= vm_ndomains || t >= vm_ndomains) return (-1); return (mem_locality[f * vm_ndomains + t]); #else return (-1); #endif } #ifdef VM_NUMA_ALLOC /* * Outputs the VM locality table. */ static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; int error, i, j; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\n"); for (i = 0; i < vm_ndomains; i++) { sbuf_printf(&sbuf, "%d: ", i); for (j = 0; j < vm_ndomains; j++) { sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j)); } sbuf_printf(&sbuf, "\n"); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } #endif static void vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail) { m->order = order; if (tail) TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q); else TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q); fl[order].lcnt++; } static void vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order) { TAILQ_REMOVE(&fl[order].pl, m, plinks.q); fl[order].lcnt--; m->order = VM_NFREEORDER; } /* * Create a physical memory segment. */ static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain) { struct vm_phys_seg *seg; KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX, ("vm_phys_create_seg: increase VM_PHYSSEG_MAX")); KASSERT(domain < vm_ndomains, ("vm_phys_create_seg: invalid domain provided")); seg = &vm_phys_segs[vm_phys_nsegs++]; while (seg > vm_phys_segs && (seg - 1)->start >= end) { *seg = *(seg - 1); seg--; } seg->start = start; seg->end = end; seg->domain = domain; } static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end) { #ifdef VM_NUMA_ALLOC int i; if (mem_affinity == NULL) { _vm_phys_create_seg(start, end, 0); return; } for (i = 0;; i++) { if (mem_affinity[i].end == 0) panic("Reached end of affinity info"); if (mem_affinity[i].end <= start) continue; if (mem_affinity[i].start > start) panic("No affinity info for start %jx", (uintmax_t)start); if (mem_affinity[i].end >= end) { _vm_phys_create_seg(start, end, mem_affinity[i].domain); break; } _vm_phys_create_seg(start, mem_affinity[i].end, mem_affinity[i].domain); start = mem_affinity[i].end; } #else _vm_phys_create_seg(start, end, 0); #endif } /* * Add a physical memory segment. */ void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end) { vm_paddr_t paddr; KASSERT((start & PAGE_MASK) == 0, ("vm_phys_define_seg: start is not page aligned")); KASSERT((end & PAGE_MASK) == 0, ("vm_phys_define_seg: end is not page aligned")); /* * Split the physical memory segment if it spans two or more free * list boundaries. */ paddr = start; #ifdef VM_FREELIST_ISADMA if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) { vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY); paddr = VM_ISADMA_BOUNDARY; } #endif #ifdef VM_FREELIST_LOWMEM if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) { vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY); paddr = VM_LOWMEM_BOUNDARY; } #endif #ifdef VM_FREELIST_DMA32 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) { vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY); paddr = VM_DMA32_BOUNDARY; } #endif vm_phys_create_seg(paddr, end); } /* * Initialize the physical memory allocator. * * Requires that vm_page_array is initialized! */ void vm_phys_init(void) { struct vm_freelist *fl; struct vm_phys_seg *seg; u_long npages; int dom, flind, freelist, oind, pind, segind; /* * Compute the number of free lists, and generate the mapping from the * manifest constants VM_FREELIST_* to the free list indices. * * Initially, the entries of vm_freelist_to_flind[] are set to either * 0 or 1 to indicate which free lists should be created. */ npages = 0; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1; else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1; else #endif #ifdef VM_FREELIST_DMA32 if ( #ifdef VM_DMA32_NPAGES_THRESHOLD /* * Create the DMA32 free list only if the amount of * physical memory above physical address 4G exceeds the * given threshold. */ npages > VM_DMA32_NPAGES_THRESHOLD && #endif seg->end <= VM_DMA32_BOUNDARY) vm_freelist_to_flind[VM_FREELIST_DMA32] = 1; else #endif { npages += atop(seg->end - seg->start); vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1; } } /* Change each entry into a running total of the free lists. */ for (freelist = 1; freelist < VM_NFREELIST; freelist++) { vm_freelist_to_flind[freelist] += vm_freelist_to_flind[freelist - 1]; } vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1]; KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists")); /* Change each entry into a free list index. */ for (freelist = 0; freelist < VM_NFREELIST; freelist++) vm_freelist_to_flind[freelist]--; /* * Initialize the first_page and free_queues fields of each physical * memory segment. */ #ifdef VM_PHYSSEG_SPARSE npages = 0; #endif for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; #ifdef VM_PHYSSEG_SPARSE seg->first_page = &vm_page_array[npages]; npages += atop(seg->end - seg->start); #else seg->first_page = PHYS_TO_VM_PAGE(seg->start); #endif #ifdef VM_FREELIST_ISADMA if (seg->end <= VM_ISADMA_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_ISADMA]; KASSERT(flind >= 0, ("vm_phys_init: ISADMA flind < 0")); } else #endif #ifdef VM_FREELIST_LOWMEM if (seg->end <= VM_LOWMEM_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM]; KASSERT(flind >= 0, ("vm_phys_init: LOWMEM flind < 0")); } else #endif #ifdef VM_FREELIST_DMA32 if (seg->end <= VM_DMA32_BOUNDARY) { flind = vm_freelist_to_flind[VM_FREELIST_DMA32]; KASSERT(flind >= 0, ("vm_phys_init: DMA32 flind < 0")); } else #endif { flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT]; KASSERT(flind >= 0, ("vm_phys_init: DEFAULT flind < 0")); } seg->free_queues = &vm_phys_free_queues[seg->domain][flind]; } /* * Initialize the free queues. */ for (dom = 0; dom < vm_ndomains; dom++) { for (flind = 0; flind < vm_nfreelists; flind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; for (oind = 0; oind < VM_NFREEORDER; oind++) TAILQ_INIT(&fl[oind].pl); } } } rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } /* * Split a contiguous, power of two-sized set of physical pages. */ static __inline void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order) { vm_page_t m_buddy; while (oind > order) { oind--; m_buddy = &m[1 << oind]; KASSERT(m_buddy->order == VM_NFREEORDER, ("vm_phys_split_pages: page %p has unexpected order %d", m_buddy, m_buddy->order)); vm_freelist_add(fl, m_buddy, oind, 0); } } /* * Initialize a physical page and add it to the free lists. */ void vm_phys_add_page(vm_paddr_t pa) { vm_page_t m; struct vm_domain *vmd; vm_cnt.v_page_count++; m = vm_phys_paddr_to_vm_page(pa); m->busy_lock = VPB_UNBUSIED; m->phys_addr = pa; m->queue = PQ_NONE; m->segind = vm_phys_paddr_to_segind(pa); vmd = vm_phys_domain(m); vmd->vmd_page_count++; vmd->vmd_segs |= 1UL << m->segind; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_add_page: page %p has unexpected order %d", m, m->order)); m->pool = VM_FREEPOOL_DEFAULT; pmap_page_init(m); mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m, 0); mtx_unlock(&vm_page_queue_free_mtx); } /* * Allocate a contiguous, power of two-sized set of physical pages * from the free lists. * * The free page queues must be locked. */ vm_page_t vm_phys_alloc_pages(int pool, int order) { vm_page_t m; int domain, flind; struct vm_domain_iterator vi; KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_pages: order %d is out of range", order)); vm_policy_iterator_init(&vi); while ((vm_domain_iterator_run(&vi, &domain)) == 0) { for (flind = 0; flind < vm_nfreelists; flind++) { m = vm_phys_alloc_domain_pages(domain, flind, pool, order); if (m != NULL) return (m); } } vm_policy_iterator_finish(&vi); return (NULL); } /* * Allocate a contiguous, power of two-sized set of physical pages from the * specified free list. The free list must be specified using one of the * manifest constants VM_FREELIST_*. * * The free page queues must be locked. */ vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order) { vm_page_t m; struct vm_domain_iterator vi; int domain; KASSERT(freelist < VM_NFREELIST, ("vm_phys_alloc_freelist_pages: freelist %d is out of range", freelist)); KASSERT(pool < VM_NFREEPOOL, ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_alloc_freelist_pages: order %d is out of range", order)); vm_policy_iterator_init(&vi); while ((vm_domain_iterator_run(&vi, &domain)) == 0) { m = vm_phys_alloc_domain_pages(domain, vm_freelist_to_flind[freelist], pool, order); if (m != NULL) return (m); } vm_policy_iterator_finish(&vi); return (NULL); } static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool, int order) { struct vm_freelist *fl; struct vm_freelist *alt; int oind, pind; vm_page_t m; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); fl = &vm_phys_free_queues[domain][flind][pool][0]; for (oind = order; oind < VM_NFREEORDER; oind++) { m = TAILQ_FIRST(&fl[oind].pl); if (m != NULL) { vm_freelist_rem(fl, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } /* * The given pool was empty. Find the largest * contiguous, power-of-two-sized set of pages in any * pool. Transfer these pages to the given pool, and * use them to satisfy the allocation. */ for (oind = VM_NFREEORDER - 1; oind >= order; oind--) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { alt = &vm_phys_free_queues[domain][flind][pind][0]; m = TAILQ_FIRST(&alt[oind].pl); if (m != NULL) { vm_freelist_rem(alt, m, oind); vm_phys_set_pool(pool, m, oind); vm_phys_split_pages(m, oind, fl, order); return (m); } } } return (NULL); } /* * Find the vm_page corresponding to the given physical address. */ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (&seg->first_page[atop(pa - seg->start)]); } return (NULL); } vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa) { struct vm_phys_fictitious_seg tmp, *seg; vm_page_t m; m = NULL; tmp.start = pa; tmp.end = 0; rw_rlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); rw_runlock(&vm_phys_fictitious_reg_lock); if (seg == NULL) return (NULL); m = &seg->first_page[atop(pa - seg->start)]; KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m)); return (m); } static inline void vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start, long page_count, vm_memattr_t memattr) { long i; for (i = 0; i < page_count; i++) { vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr); range[i].oflags &= ~VPO_UNMANAGED; range[i].busy_lock = VPB_UNBUSIED; } } int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr) { struct vm_phys_fictitious_seg *seg; vm_page_t fp; long page_count; #ifdef VM_PHYSSEG_DENSE long pi, pe; long dpage_count; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); page_count = (end - start) / PAGE_SIZE; #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { fp = &vm_page_array[pi - first_page]; if ((pe - first_page) > vm_page_array_size) { /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Use vm_page_array pages for those that are * inside of the vm_page_array range, and * allocate the remaining ones. */ dpage_count = vm_page_array_size - (pi - first_page); vm_phys_fictitious_init_range(fp, start, dpage_count, memattr); page_count -= dpage_count; start += ptoa(dpage_count); goto alloc; } /* * We can allocate the full range from vm_page_array, * so there's no need to register the range in the tree. */ vm_phys_fictitious_init_range(fp, start, page_count, memattr); return (0); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ fp = &vm_page_array[0]; dpage_count = pe - first_page; vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count, memattr); end -= ptoa(dpage_count); page_count -= dpage_count; goto alloc; } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* * Trying to register a fictitious range that expands before * and after vm_page_array. */ return (EINVAL); } else { alloc: #endif fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES, M_WAITOK | M_ZERO); #ifdef VM_PHYSSEG_DENSE } #endif vm_phys_fictitious_init_range(fp, start, page_count, memattr); seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO); seg->start = start; seg->end = end; seg->first_page = fp; rw_wlock(&vm_phys_fictitious_reg_lock); RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); return (0); } void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end) { struct vm_phys_fictitious_seg *seg, tmp; #ifdef VM_PHYSSEG_DENSE long pi, pe; #endif KASSERT(start < end, ("Start of segment isn't less than end (start: %jx end: %jx)", (uintmax_t)start, (uintmax_t)end)); #ifdef VM_PHYSSEG_DENSE pi = atop(start); pe = atop(end); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { if ((pe - first_page) <= vm_page_array_size) { /* * This segment was allocated using vm_page_array * only, there's nothing to do since those pages * were never added to the tree. */ return; } /* * We have a segment that starts inside * of vm_page_array, but ends outside of it. * * Calculate how many pages were added to the * tree and free them. */ start = ptoa(first_page + vm_page_array_size); } else if (pe > first_page && (pe - first_page) < vm_page_array_size) { /* * We have a segment that ends inside of vm_page_array, * but starts outside of it. */ end = ptoa(first_page); } else if (pi < first_page && pe > (first_page + vm_page_array_size)) { /* Since it's not possible to register such a range, panic. */ panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } #endif tmp.start = start; tmp.end = 0; rw_wlock(&vm_phys_fictitious_reg_lock); seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp); if (seg->start != start || seg->end != end) { rw_wunlock(&vm_phys_fictitious_reg_lock); panic( "Unregistering not registered fictitious range [%#jx:%#jx]", (uintmax_t)start, (uintmax_t)end); } RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg); rw_wunlock(&vm_phys_fictitious_reg_lock); free(seg->first_page, M_FICT_PAGES); free(seg, M_FICT_PAGES); } /* * Find the segment containing the given physical address. */ static int vm_phys_paddr_to_segind(vm_paddr_t pa) { struct vm_phys_seg *seg; int segind; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (pa >= seg->start && pa < seg->end) return (segind); } panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" , (uintmax_t)pa); } /* * Free a contiguous, power of two-sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_pages(vm_page_t m, int order) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa; vm_page_t m_buddy; KASSERT(m->order == VM_NFREEORDER, ("vm_phys_free_pages: page %p has unexpected order %d", m, m->order)); KASSERT(m->pool < VM_NFREEPOOL, ("vm_phys_free_pages: page %p has unexpected pool %d", m, m->pool)); KASSERT(order < VM_NFREEORDER, ("vm_phys_free_pages: order %d is out of range", order)); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); seg = &vm_phys_segs[m->segind]; if (order < VM_NFREEORDER - 1) { pa = VM_PAGE_TO_PHYS(m); do { pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order)); if (pa < seg->start || pa >= seg->end) break; m_buddy = &seg->first_page[atop(pa - seg->start)]; if (m_buddy->order != order) break; fl = (*seg->free_queues)[m_buddy->pool]; vm_freelist_rem(fl, m_buddy, order); if (m_buddy->pool != m->pool) vm_phys_set_pool(m->pool, m_buddy, order); order++; pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1); m = &seg->first_page[atop(pa - seg->start)]; } while (order < VM_NFREEORDER - 1); } fl = (*seg->free_queues)[m->pool]; vm_freelist_add(fl, m, order, 1); } /* * Free a contiguous, arbitrarily sized set of physical pages. * * The free page queues must be locked. */ void vm_phys_free_contig(vm_page_t m, u_long npages) { u_int n; int order; /* * Avoid unnecessary coalescing by freeing the pages in the largest * possible power-of-two-sized subsets. */ mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;; npages -= n) { /* * Unsigned "min" is used here so that "order" is assigned * "VM_NFREEORDER - 1" when "m"'s physical address is zero * or the low-order bits of its physical address are zero * because the size of a physical address exceeds the size of * a long. */ order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1, VM_NFREEORDER - 1); n = 1 << order; if (npages < n) break; vm_phys_free_pages(m, order); m += n; } /* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */ for (; npages > 0; npages -= n) { order = flsl(npages) - 1; n = 1 << order; vm_phys_free_pages(m, order); m += n; } } /* * Scan physical memory between the specified addresses "low" and "high" for a * run of contiguous physical pages that satisfy the specified conditions, and * return the lowest page in the run. The specified "alignment" determines * the alignment of the lowest physical page in the run. If the specified * "boundary" is non-zero, then the run of physical pages cannot span a * physical address that is a multiple of "boundary". * * "npages" must be greater than zero. Both "alignment" and "boundary" must * be a power of two. */ vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options) { vm_paddr_t pa_end; vm_page_t m_end, m_run, m_start; struct vm_phys_seg *seg; int segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); if (low >= high) return (NULL); for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; if (seg->start >= high) break; if (low >= seg->end) continue; if (low <= seg->start) m_start = seg->first_page; else m_start = &seg->first_page[atop(low - seg->start)]; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages)) continue; m_end = &seg->first_page[atop(pa_end - seg->start)]; m_run = vm_page_scan_contig(npages, m_start, m_end, alignment, boundary, options); if (m_run != NULL) return (m_run); } return (NULL); } /* * Set the pool for a contiguous, power of two-sized set of physical pages. */ void vm_phys_set_pool(int pool, vm_page_t m, int order) { vm_page_t m_tmp; for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++) m_tmp->pool = pool; } /* * Search for the given physical page "m" in the free lists. If the search * succeeds, remove "m" from the free lists and return TRUE. Otherwise, return * FALSE, indicating that "m" is not in the free lists. * * The free page queues must be locked. */ boolean_t vm_phys_unfree_page(vm_page_t m) { struct vm_freelist *fl; struct vm_phys_seg *seg; vm_paddr_t pa, pa_half; vm_page_t m_set, m_tmp; int order; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * First, find the contiguous, power of two-sized set of free * physical pages containing the given physical page "m" and * assign it to "m_set". */ seg = &vm_phys_segs[m->segind]; for (m_set = m, order = 0; m_set->order == VM_NFREEORDER && order < VM_NFREEORDER - 1; ) { order++; pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order)); if (pa >= seg->start) m_set = &seg->first_page[atop(pa - seg->start)]; else return (FALSE); } if (m_set->order < order) return (FALSE); if (m_set->order == VM_NFREEORDER) return (FALSE); KASSERT(m_set->order < VM_NFREEORDER, ("vm_phys_unfree_page: page %p has unexpected order %d", m_set, m_set->order)); /* * Next, remove "m_set" from the free lists. Finally, extract * "m" from "m_set" using an iterative algorithm: While "m_set" * is larger than a page, shrink "m_set" by returning the half * of "m_set" that does not contain "m" to the free lists. */ fl = (*seg->free_queues)[m_set->pool]; order = m_set->order; vm_freelist_rem(fl, m_set, order); while (order > 0) { order--; pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order)); if (m->phys_addr < pa_half) m_tmp = &seg->first_page[atop(pa_half - seg->start)]; else { m_tmp = m_set; m_set = &seg->first_page[atop(pa_half - seg->start)]; } vm_freelist_add(fl, m_tmp, order, 0); } KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency")); return (TRUE); } /* * Try to zero one physical page. Used by an idle priority thread. */ boolean_t vm_phys_zero_pages_idle(void) { static struct vm_freelist *fl; static int flind, oind, pind; vm_page_t m, m_tmp; int domain; domain = vm_rr_selectdomain(); fl = vm_phys_free_queues[domain][0][0]; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); for (;;) { TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) { for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) { - if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) { + if ((m_tmp->flags & PG_ZERO) == 0) { vm_phys_unfree_page(m_tmp); vm_phys_freecnt_adj(m, -1); mtx_unlock(&vm_page_queue_free_mtx); pmap_zero_page_idle(m_tmp); m_tmp->flags |= PG_ZERO; mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); vm_phys_free_pages(m_tmp, 0); vm_page_zero_count++; cnt_prezero++; return (TRUE); } } } oind++; if (oind == VM_NFREEORDER) { oind = 0; pind++; if (pind == VM_NFREEPOOL) { pind = 0; flind++; if (flind == vm_nfreelists) flind = 0; } fl = vm_phys_free_queues[domain][flind][pind]; } } } /* * Allocate a contiguous set of physical pages of the given size * "npages" from the free lists. All of the physical pages must be at * or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross * any physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa_end, pa_start; vm_page_t m_run; struct vm_domain_iterator vi; struct vm_phys_seg *seg; int domain, segind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (low >= high) return (NULL); vm_policy_iterator_init(&vi); restartdom: if (vm_domain_iterator_run(&vi, &domain) != 0) { vm_policy_iterator_finish(&vi); return (NULL); } m_run = NULL; for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) { seg = &vm_phys_segs[segind]; if (seg->start >= high || seg->domain != domain) continue; if (low >= seg->end) break; if (low <= seg->start) pa_start = seg->start; else pa_start = low; if (high < seg->end) pa_end = high; else pa_end = seg->end; if (pa_end - pa_start < ptoa(npages)) continue; m_run = vm_phys_alloc_seg_contig(seg, npages, low, high, alignment, boundary); if (m_run != NULL) break; } if (m_run == NULL && !vm_domain_iterator_isdone(&vi)) goto restartdom; vm_policy_iterator_finish(&vi); return (m_run); } /* * Allocate a run of contiguous physical pages from the free list for the * specified segment. */ static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_freelist *fl; vm_paddr_t pa, pa_end, size; vm_page_t m, m_ret; u_long npages_end; int oind, order, pind; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* Compute the queue that is the best fit for npages. */ for (order = 0; (1 << order) < npages; order++); /* Search for a run satisfying the specified conditions. */ size = npages << PAGE_SHIFT; for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) { for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = (*seg->free_queues)[pind]; TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) { /* * Is the size of this allocation request * larger than the largest block size? */ if (order >= VM_NFREEORDER) { /* * Determine if a sufficient number of * subsequent blocks to satisfy the * allocation request are free. */ pa = VM_PAGE_TO_PHYS(m_ret); pa_end = pa + size; for (;;) { pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1); if (pa >= pa_end || pa < seg->start || pa >= seg->end) break; m = &seg->first_page[atop(pa - seg->start)]; if (m->order != VM_NFREEORDER - 1) break; } /* If not, go to the next block. */ if (pa < pa_end) continue; } /* * Determine if the blocks are within the * given range, satisfy the given alignment, * and do not cross the given boundary. */ pa = VM_PAGE_TO_PHYS(m_ret); pa_end = pa + size; if (pa >= low && pa_end <= high && (pa & (alignment - 1)) == 0 && rounddown2(pa ^ (pa_end - 1), boundary) == 0) goto done; } } } return (NULL); done: for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) { fl = (*seg->free_queues)[m->pool]; vm_freelist_rem(fl, m, m->order); } if (m_ret->pool != VM_FREEPOOL_DEFAULT) vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind); fl = (*seg->free_queues)[m_ret->pool]; vm_phys_split_pages(m_ret, oind, fl, order); /* Return excess pages to the free lists. */ npages_end = roundup2(npages, 1 << imin(oind, order)); if (npages < npages_end) vm_phys_free_contig(&m_ret[npages], npages_end - npages); return (m_ret); } #ifdef DDB /* * Show the number of physical pages in each of the free lists. */ DB_SHOW_COMMAND(freepages, db_show_freepages) { struct vm_freelist *fl; int flind, oind, pind, dom; for (dom = 0; dom < vm_ndomains; dom++) { db_printf("DOMAIN: %d\n", dom); for (flind = 0; flind < vm_nfreelists; flind++) { db_printf("FREE LIST %d:\n" "\n ORDER (SIZE) | NUMBER" "\n ", flind); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf(" | POOL %d", pind); db_printf("\n-- "); for (pind = 0; pind < VM_NFREEPOOL; pind++) db_printf("-- -- "); db_printf("--\n"); for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { db_printf(" %2.2d (%6.6dK)", oind, 1 << (PAGE_SHIFT - 10 + oind)); for (pind = 0; pind < VM_NFREEPOOL; pind++) { fl = vm_phys_free_queues[dom][flind][pind]; db_printf(" | %6.6d", fl[oind].lcnt); } db_printf("\n"); } db_printf("\n"); } db_printf("\n"); } } #endif Index: stable/11/sys/vm/vm_radix.c =================================================================== --- stable/11/sys/vm/vm_radix.c (revision 318715) +++ stable/11/sys/vm/vm_radix.c (revision 318716) @@ -1,854 +1,801 @@ /* * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Path-compressed radix trie implementation. * The following code is not generalized into a general purpose library * because there are way too many parameters embedded that should really * be decided by the library consumers. At the same time, consumers * of this code must achieve highest possible performance. * * The implementation takes into account the following rationale: * - Size of the nodes should be as small as possible but still big enough * to avoid a large maximum depth for the trie. This is a balance * between the necessity to not wire too much physical memory for the nodes * and the necessity to avoid too much cache pollution during the trie * operations. * - There is not a huge bias toward the number of lookup operations over * the number of insert and remove operations. This basically implies * that optimizations supposedly helping one operation but hurting the * other might be carefully evaluated. * - On average not many nodes are expected to be fully populated, hence * level compression may just complicate things. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* * These widths should allow the pointers to a node's children to fit within * a single cache line. The extra levels from a narrow width should not be * a problem thanks to path compression. */ #ifdef __LP64__ #define VM_RADIX_WIDTH 4 #else #define VM_RADIX_WIDTH 3 #endif #define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH) #define VM_RADIX_MASK (VM_RADIX_COUNT - 1) #define VM_RADIX_LIMIT \ (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1) /* Flag bits stored in node pointers. */ #define VM_RADIX_ISLEAF 0x1 #define VM_RADIX_FLAGS 0x1 #define VM_RADIX_PAD VM_RADIX_FLAGS /* Returns one unit associated with specified level. */ #define VM_RADIX_UNITLEVEL(lev) \ ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH)) struct vm_radix_node { vm_pindex_t rn_owner; /* Owner of record. */ uint16_t rn_count; /* Valid children. */ uint16_t rn_clev; /* Current level. */ void *rn_child[VM_RADIX_COUNT]; /* Child nodes. */ }; static uma_zone_t vm_radix_node_zone; /* * Allocate a radix node. */ static __inline struct vm_radix_node * vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel) { struct vm_radix_node *rnode; rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO); if (rnode == NULL) return (NULL); rnode->rn_owner = owner; rnode->rn_count = count; rnode->rn_clev = clevel; return (rnode); } /* * Free radix node. */ static __inline void vm_radix_node_put(struct vm_radix_node *rnode) { uma_zfree(vm_radix_node_zone, rnode); } /* * Return the position in the array for a given level. */ static __inline int vm_radix_slot(vm_pindex_t index, uint16_t level) { return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK); } /* Trims the key after the specified level. */ static __inline vm_pindex_t vm_radix_trimkey(vm_pindex_t index, uint16_t level) { vm_pindex_t ret; ret = index; if (level > 0) { ret >>= level * VM_RADIX_WIDTH; ret <<= level * VM_RADIX_WIDTH; } return (ret); } /* * Get the root node for a radix tree. */ static __inline struct vm_radix_node * vm_radix_getroot(struct vm_radix *rtree) { return ((struct vm_radix_node *)rtree->rt_root); } /* * Set the root node for a radix tree. */ static __inline void vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode) { rtree->rt_root = (uintptr_t)rnode; } /* * Returns TRUE if the specified radix node is a leaf and FALSE otherwise. */ static __inline boolean_t vm_radix_isleaf(struct vm_radix_node *rnode) { return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0); } /* * Returns the associated page extracted from rnode. */ static __inline vm_page_t vm_radix_topage(struct vm_radix_node *rnode) { return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS)); } /* * Adds the page as a child of the provided node. */ static __inline void vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev, vm_page_t page) { int slot; slot = vm_radix_slot(index, clev); rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF); } /* * Returns the slot where two keys differ. * It cannot accept 2 equal keys. */ static __inline uint16_t vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2) { uint16_t clev; KASSERT(index1 != index2, ("%s: passing the same key value %jx", __func__, (uintmax_t)index1)); index1 ^= index2; for (clev = VM_RADIX_LIMIT;; clev--) if (vm_radix_slot(index1, clev) != 0) return (clev); } /* * Returns TRUE if it can be determined that key does not belong to the * specified rnode. Otherwise, returns FALSE. */ static __inline boolean_t vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx) { if (rnode->rn_clev < VM_RADIX_LIMIT) { idx = vm_radix_trimkey(idx, rnode->rn_clev + 1); return (idx != rnode->rn_owner); } return (FALSE); } /* * Internal helper for vm_radix_reclaim_allnodes(). * This function is recursive. */ static void vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode) { int slot; KASSERT(rnode->rn_count <= VM_RADIX_COUNT, ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode)); for (slot = 0; rnode->rn_count != 0; slot++) { if (rnode->rn_child[slot] == NULL) continue; if (!vm_radix_isleaf(rnode->rn_child[slot])) vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]); rnode->rn_child[slot] = NULL; rnode->rn_count--; } vm_radix_node_put(rnode); } #ifdef INVARIANTS /* * Radix node zone destructor. */ static void vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused) { struct vm_radix_node *rnode; int slot; rnode = mem; KASSERT(rnode->rn_count == 0, ("vm_radix_node_put: rnode %p has %d children", rnode, rnode->rn_count)); for (slot = 0; slot < VM_RADIX_COUNT; slot++) KASSERT(rnode->rn_child[slot] == NULL, ("vm_radix_node_put: rnode %p has a child", rnode)); } #endif #ifndef UMA_MD_SMALL_ALLOC /* * Reserve the KVA necessary to satisfy the node allocation. * This is mandatory in architectures not supporting direct * mapping as they will need otherwise to carve into the kernel maps for * every node allocation, resulting into deadlocks for consumers already * working with kernel maps. */ static void vm_radix_reserve_kva(void *arg __unused) { /* * Calculate the number of reserved nodes, discounting the pages that * are needed to store them. */ if (!uma_zone_reserve_kva(vm_radix_node_zone, ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE + sizeof(struct vm_radix_node)))) panic("%s: unable to reserve KVA", __func__); } SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_THIRD, vm_radix_reserve_kva, NULL); #endif /* * Initialize the UMA slab zone. */ void vm_radix_init(void) { vm_radix_node_zone = uma_zcreate("RADIX NODE", sizeof(struct vm_radix_node), NULL, #ifdef INVARIANTS vm_radix_node_zone_dtor, #else NULL, #endif NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM); } /* * Inserts the key-value pair into the trie. * Panics if the key already exists. */ int vm_radix_insert(struct vm_radix *rtree, vm_page_t page) { vm_pindex_t index, newind; void **parentp; struct vm_radix_node *rnode, *tmp; vm_page_t m; int slot; uint16_t clev; index = page->pindex; -restart: - /* * The owner of record for root is not really important because it * will never be used. */ rnode = vm_radix_getroot(rtree); if (rnode == NULL) { rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF; return (0); } parentp = (void **)&rtree->rt_root; for (;;) { if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex == index) panic("%s: key %jx is already present", __func__, (uintmax_t)index); clev = vm_radix_keydiff(m->pindex, index); - - /* - * During node allocation the trie that is being - * walked can be modified because of recursing radix - * trie operations. - * If this is the case, the recursing functions signal - * such situation and the insert operation must - * start from scratch again. - * The freed radix node will then be in the UMA - * caches very likely to avoid the same situation - * to happen. - */ - rtree->rt_flags |= RT_INSERT_INPROG; tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); - rtree->rt_flags &= ~RT_INSERT_INPROG; - if (tmp == NULL) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; + if (tmp == NULL) return (ENOMEM); - } - if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; - tmp->rn_count = 0; - vm_radix_node_put(tmp); - goto restart; - } *parentp = tmp; vm_radix_addpage(tmp, index, clev, page); vm_radix_addpage(tmp, m->pindex, clev, m); return (0); } else if (vm_radix_keybarr(rnode, index)) break; slot = vm_radix_slot(index, rnode->rn_clev); if (rnode->rn_child[slot] == NULL) { rnode->rn_count++; vm_radix_addpage(rnode, index, rnode->rn_clev, page); return (0); } parentp = &rnode->rn_child[slot]; rnode = rnode->rn_child[slot]; } /* * A new node is needed because the right insertion level is reached. * Setup the new intermediate node and add the 2 children: the * new object and the older edge. */ newind = rnode->rn_owner; clev = vm_radix_keydiff(newind, index); - - /* See the comments above. */ - rtree->rt_flags |= RT_INSERT_INPROG; tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); - rtree->rt_flags &= ~RT_INSERT_INPROG; - if (tmp == NULL) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; + if (tmp == NULL) return (ENOMEM); - } - if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) { - rtree->rt_flags &= ~RT_TRIE_MODIFIED; - tmp->rn_count = 0; - vm_radix_node_put(tmp); - goto restart; - } *parentp = tmp; vm_radix_addpage(tmp, index, clev, page); slot = vm_radix_slot(newind, clev); tmp->rn_child[slot] = rnode; return (0); } /* * Returns TRUE if the specified radix tree contains a single leaf and FALSE * otherwise. */ boolean_t vm_radix_is_singleton(struct vm_radix *rtree) { struct vm_radix_node *rnode; rnode = vm_radix_getroot(rtree); if (rnode == NULL) return (FALSE); return (vm_radix_isleaf(rnode)); } /* * Returns the value stored at the index. If the index is not present, * NULL is returned. */ vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *rnode; vm_page_t m; int slot; rnode = vm_radix_getroot(rtree); while (rnode != NULL) { if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex == index) return (m); else break; } else if (vm_radix_keybarr(rnode, index)) break; slot = vm_radix_slot(index, rnode->rn_clev); rnode = rnode->rn_child[slot]; } return (NULL); } /* * Look up the nearest entry at a position bigger than or equal to index. */ vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *stack[VM_RADIX_LIMIT]; vm_pindex_t inc; vm_page_t m; struct vm_radix_node *child, *rnode; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; rnode = vm_radix_getroot(rtree); if (rnode == NULL) return (NULL); else if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex >= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the smallest key * in the current node (if the owner is bigger than the * search key). */ if (vm_radix_keybarr(rnode, index)) { if (index > rnode->rn_owner) { ascend: KASSERT(++loops < 1000, ("vm_radix_lookup_ge: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); rnode = stack[--tos]; } while (vm_radix_slot(index, rnode->rn_clev) == (VM_RADIX_COUNT - 1)); /* * The following computation cannot overflow * because index's slot at the current level * is less than VM_RADIX_COUNT - 1. */ index = vm_radix_trimkey(index, rnode->rn_clev); index += VM_RADIX_UNITLEVEL(rnode->rn_clev); } else index = rnode->rn_owner; KASSERT(!vm_radix_keybarr(rnode, index), ("vm_radix_lookup_ge: keybarr failed")); } slot = vm_radix_slot(index, rnode->rn_clev); child = rnode->rn_child[slot]; if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex >= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or page within the current * bisection node. */ if (slot < (VM_RADIX_COUNT - 1)) { inc = VM_RADIX_UNITLEVEL(rnode->rn_clev); index = vm_radix_trimkey(index, rnode->rn_clev); do { index += inc; slot++; child = rnode->rn_child[slot]; if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex >= index) return (m); } else if (child != NULL) goto descend; } while (slot < (VM_RADIX_COUNT - 1)); } KASSERT(child == NULL || vm_radix_isleaf(child), ("vm_radix_lookup_ge: child is radix node")); /* * If a page or edge bigger than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(rnode->rn_clev > 0, ("vm_radix_lookup_ge: pushing leaf's parent")); KASSERT(tos < VM_RADIX_LIMIT, ("vm_radix_lookup_ge: stack overflow")); stack[tos++] = rnode; rnode = child; } } /* * Look up the nearest entry at a position less than or equal to index. */ vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *stack[VM_RADIX_LIMIT]; vm_pindex_t inc; vm_page_t m; struct vm_radix_node *child, *rnode; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; rnode = vm_radix_getroot(rtree); if (rnode == NULL) return (NULL); else if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex <= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the largest key * in the current node (if the owner is smaller than the * search key). */ if (vm_radix_keybarr(rnode, index)) { if (index > rnode->rn_owner) { index = rnode->rn_owner + VM_RADIX_COUNT * VM_RADIX_UNITLEVEL(rnode->rn_clev); } else { ascend: KASSERT(++loops < 1000, ("vm_radix_lookup_le: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); rnode = stack[--tos]; } while (vm_radix_slot(index, rnode->rn_clev) == 0); /* * The following computation cannot overflow * because index's slot at the current level * is greater than 0. */ index = vm_radix_trimkey(index, rnode->rn_clev); } index--; KASSERT(!vm_radix_keybarr(rnode, index), ("vm_radix_lookup_le: keybarr failed")); } slot = vm_radix_slot(index, rnode->rn_clev); child = rnode->rn_child[slot]; if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex <= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or page within the current * bisection node. */ if (slot > 0) { inc = VM_RADIX_UNITLEVEL(rnode->rn_clev); index |= inc - 1; do { index -= inc; slot--; child = rnode->rn_child[slot]; if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex <= index) return (m); } else if (child != NULL) goto descend; } while (slot > 0); } KASSERT(child == NULL || vm_radix_isleaf(child), ("vm_radix_lookup_le: child is radix node")); /* * If a page or edge smaller than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(rnode->rn_clev > 0, ("vm_radix_lookup_le: pushing leaf's parent")); KASSERT(tos < VM_RADIX_LIMIT, ("vm_radix_lookup_le: stack overflow")); stack[tos++] = rnode; rnode = child; } } /* - * Remove the specified index from the tree. - * Panics if the key is not present. + * Remove the specified index from the trie, and return the value stored at + * that index. If the index is not present, return NULL. */ -void +vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *rnode, *parent; vm_page_t m; int i, slot; - /* - * Detect if a page is going to be removed from a trie which is - * already undergoing another trie operation. - * Right now this is only possible for vm_radix_remove() recursing - * into vm_radix_insert(). - * If this is the case, the caller must be notified about this - * situation. It will also takecare to update the RT_TRIE_MODIFIED - * accordingly. - * The RT_TRIE_MODIFIED bit is set here because the remove operation - * will always succeed. - */ - if ((rtree->rt_flags & RT_INSERT_INPROG) != 0) - rtree->rt_flags |= RT_TRIE_MODIFIED; - rnode = vm_radix_getroot(rtree); if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex != index) - panic("%s: invalid key found", __func__); + return (NULL); vm_radix_setroot(rtree, NULL); - return; + return (m); } parent = NULL; for (;;) { if (rnode == NULL) - panic("vm_radix_remove: impossible to locate the key"); + return (NULL); slot = vm_radix_slot(index, rnode->rn_clev); if (vm_radix_isleaf(rnode->rn_child[slot])) { m = vm_radix_topage(rnode->rn_child[slot]); if (m->pindex != index) - panic("%s: invalid key found", __func__); + return (NULL); rnode->rn_child[slot] = NULL; rnode->rn_count--; if (rnode->rn_count > 1) - break; + return (m); for (i = 0; i < VM_RADIX_COUNT; i++) if (rnode->rn_child[i] != NULL) break; KASSERT(i != VM_RADIX_COUNT, ("%s: invalid node configuration", __func__)); if (parent == NULL) vm_radix_setroot(rtree, rnode->rn_child[i]); else { slot = vm_radix_slot(index, parent->rn_clev); KASSERT(parent->rn_child[slot] == rnode, ("%s: invalid child value", __func__)); parent->rn_child[slot] = rnode->rn_child[i]; } rnode->rn_count--; rnode->rn_child[i] = NULL; vm_radix_node_put(rnode); - break; + return (m); } parent = rnode; rnode = rnode->rn_child[slot]; } } /* * Remove and free all the nodes from the radix tree. * This function is recursive but there is a tight control on it as the * maximum depth of the tree is fixed. */ void vm_radix_reclaim_allnodes(struct vm_radix *rtree) { struct vm_radix_node *root; - - KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0, - ("vm_radix_reclaim_allnodes: unexpected trie recursion")); root = vm_radix_getroot(rtree); if (root == NULL) return; vm_radix_setroot(rtree, NULL); if (!vm_radix_isleaf(root)) vm_radix_reclaim_allnodes_int(root); } /* * Replace an existing page in the trie with another one. * Panics if there is not an old page in the trie at the new page's index. */ vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage) { struct vm_radix_node *rnode; vm_page_t m; vm_pindex_t index; int slot; index = newpage->pindex; rnode = vm_radix_getroot(rtree); if (rnode == NULL) panic("%s: replacing page on an empty trie", __func__); if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex != index) panic("%s: original replacing root key not found", __func__); rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF; return (m); } for (;;) { slot = vm_radix_slot(index, rnode->rn_clev); if (vm_radix_isleaf(rnode->rn_child[slot])) { m = vm_radix_topage(rnode->rn_child[slot]); if (m->pindex == index) { rnode->rn_child[slot] = (void *)((uintptr_t)newpage | VM_RADIX_ISLEAF); return (m); } else break; } else if (rnode->rn_child[slot] == NULL || vm_radix_keybarr(rnode->rn_child[slot], index)) break; rnode = rnode->rn_child[slot]; } panic("%s: original replacing page not found", __func__); } #ifdef DDB /* * Show details about the given radix node. */ DB_SHOW_COMMAND(radixnode, db_show_radixnode) { struct vm_radix_node *rnode; int i; if (!have_addr) return; rnode = (struct vm_radix_node *)addr; db_printf("radixnode %p, owner %jx, children count %u, level %u:\n", (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count, rnode->rn_clev); for (i = 0; i < VM_RADIX_COUNT; i++) if (rnode->rn_child[i] != NULL) db_printf("slot: %d, val: %p, page: %p, clev: %d\n", i, (void *)rnode->rn_child[i], vm_radix_isleaf(rnode->rn_child[i]) ? vm_radix_topage(rnode->rn_child[i]) : NULL, rnode->rn_clev); } #endif /* DDB */ Index: stable/11/sys/vm/vm_radix.h =================================================================== --- stable/11/sys/vm/vm_radix.h (revision 318715) +++ stable/11/sys/vm/vm_radix.h (revision 318716) @@ -1,49 +1,49 @@ /* * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VM_RADIX_H_ #define _VM_RADIX_H_ #include #ifdef _KERNEL void vm_radix_init(void); int vm_radix_insert(struct vm_radix *rtree, vm_page_t page); boolean_t vm_radix_is_singleton(struct vm_radix *rtree); vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index); void vm_radix_reclaim_allnodes(struct vm_radix *rtree); -void vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index); +vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index); vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage); #endif /* _KERNEL */ #endif /* !_VM_RADIX_H_ */ Index: stable/11/sys/vm/vm_reserv.c =================================================================== --- stable/11/sys/vm/vm_reserv.c (revision 318715) +++ stable/11/sys/vm/vm_reserv.c (revision 318716) @@ -1,1158 +1,1123 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2011 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Superpage reservation management module * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The reservation system supports the speculative allocation of large physical - * pages ("superpages"). Speculative allocation enables the fully-automatic + * pages ("superpages"). Speculative allocation enables the fully automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ #if VM_NRESERVLEVEL > 0 /* * The number of small pages that are contained in a level 0 reservation */ #define VM_LEVEL_0_NPAGES (1 << VM_LEVEL_0_ORDER) /* * The number of bits by which a physical address is shifted to obtain the * reservation number */ #define VM_LEVEL_0_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) /* * The size of a level 0 reservation in bytes */ #define VM_LEVEL_0_SIZE (1 << VM_LEVEL_0_SHIFT) /* * Computes the index of the small page underlying the given (object, pindex) * within the reservation's array of small pages. */ #define VM_RESERV_INDEX(object, pindex) \ (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) /* * The size of a population map entry */ typedef u_long popmap_t; /* * The number of bits in a population map entry */ #define NBPOPMAP (NBBY * sizeof(popmap_t)) /* * The number of population map entries in a reservation */ #define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) /* * Clear a bit in the population map. */ static __inline void popmap_clear(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); } /* * Set a bit in the population map. */ static __inline void popmap_set(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); } /* * Is a bit in the population map clear? */ static __inline boolean_t popmap_is_clear(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); } /* * Is a bit in the population map set? */ static __inline boolean_t popmap_is_set(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); } /* * The reservation structure * * A reservation structure is constructed whenever a large physical page is * speculatively allocated to an object. The reservation provides the small * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the - * reservation is not fully utilized, it appears in the queue of partially- + * reservation is not fully utilized, it appears in the queue of partially * populated reservations. The reservation always appears on the containing * object's list of reservations. * - * A partially-populated reservation can be broken and reclaimed at any time. + * A partially populated reservation can be broken and reclaimed at any time. */ struct vm_reserv { TAILQ_ENTRY(vm_reserv) partpopq; LIST_ENTRY(vm_reserv) objq; vm_object_t object; /* containing object */ vm_pindex_t pindex; /* offset within object */ vm_page_t pages; /* first page of a superpage */ int popcnt; /* # of pages in use */ char inpartpopq; popmap_t popmap[NPOPMAP]; /* bit vector of used pages */ }; /* * The reservation array * * This array is analoguous in function to vm_page_array. It differs in the * respect that it may contain a greater number of useful reservation * structures than there are (physical) superpages. These "invalid" * reservation structures exist to trade-off space for time in the * implementation of vm_reserv_from_page(). Invalid reservation structures are * distinguishable from "valid" reservation structures by inspecting the * reservation's "pages" field. Invalid reservation structures have a NULL * "pages" field. * * vm_reserv_from_page() maps a small (physical) page to an element of this * array by computing a physical reservation number from the page's physical * address. The physical reservation number is used as the array index. * * An "active" reservation is a valid reservation structure that has a non-NULL * "object" field and a non-zero "popcnt" field. In other words, every active * reservation belongs to a particular object. Moreover, every active * reservation has an entry in the containing object's list of reservations. */ static vm_reserv_t vm_reserv_array; /* - * The partially-populated reservation queue + * The partially populated reservation queue * - * This queue enables the fast recovery of an unused cached or free small page - * from a partially-populated reservation. The reservation at the head of - * this queue is the least-recently-changed, partially-populated reservation. + * This queue enables the fast recovery of an unused free small page from a + * partially populated reservation. The reservation at the head of this queue + * is the least recently changed, partially populated reservation. * * Access to this queue is synchronized by the free page queue lock. */ static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop = TAILQ_HEAD_INITIALIZER(vm_rvq_partpop); static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info"); static long vm_reserv_broken; SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD, &vm_reserv_broken, 0, "Cumulative number of broken reservations"); static long vm_reserv_freed; SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD, &vm_reserv_freed, 0, "Cumulative number of freed reservations"); static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations"); static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, - sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues"); + sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues"); static long vm_reserv_reclaimed; SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD, &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations"); static void vm_reserv_break(vm_reserv_t rv, vm_page_t m); static void vm_reserv_depopulate(vm_reserv_t rv, int index); static vm_reserv_t vm_reserv_from_page(vm_page_t m); static boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex); static void vm_reserv_populate(vm_reserv_t rv, int index); static void vm_reserv_reclaim(vm_reserv_t rv); /* * Returns the current number of full reservations. * * Since the number of full reservations is computed without acquiring the * free page queue lock, the returned value may be inexact. */ static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) { vm_paddr_t paddr; struct vm_phys_seg *seg; vm_reserv_t rv; int fullpop, segind; fullpop = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; paddr += VM_LEVEL_0_SIZE; } } return (sysctl_handle_int(oidp, &fullpop, 0, req)); } /* - * Describes the current state of the partially-populated reservation queue. + * Describes the current state of the partially populated reservation queue. */ static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; vm_reserv_t rv; int counter, error, level, unused_pages; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\nLEVEL SIZE NUMBER\n\n"); for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { counter = 0; unused_pages = 0; mtx_lock(&vm_page_queue_free_mtx); TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) { counter++; unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; } mtx_unlock(&vm_page_queue_free_mtx); sbuf_printf(&sbuf, "%5d: %6dK, %6d\n", level, unused_pages * ((int)PAGE_SIZE / 1024), counter); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the - * reservation to the tail of the partially-populated reservation queue if the + * reservation to the tail of the partially populated reservation queue if the * population count is non-zero. * * The free page queue lock must be held. */ static void vm_reserv_depopulate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); KASSERT(popmap_is_set(rv->popmap, index), ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv, index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } else { KASSERT(rv->pages->psind == 1, ("vm_reserv_depopulate: reserv %p is already demoted", rv)); rv->pages->psind = 0; } popmap_clear(rv->popmap, index); rv->popcnt--; if (rv->popcnt == 0) { LIST_REMOVE(rv, objq); rv->object = NULL; vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER); vm_reserv_freed++; } else { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq); } } /* * Returns the reservation to which the given page might belong. */ static __inline vm_reserv_t vm_reserv_from_page(vm_page_t m) { return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]); } /* * Returns TRUE if the given reservation contains the given page index and * FALSE otherwise. */ static __inline boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex) { return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0); } /* * Increases the given reservation's population count. Moves the reservation - * to the tail of the partially-populated reservation queue. + * to the tail of the partially populated reservation queue. * * The free page queue must be locked. */ static void vm_reserv_populate(vm_reserv_t rv, int index) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_populate: reserv %p is free", rv)); KASSERT(popmap_is_clear(rv->popmap, index), ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv, index)); KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES, ("vm_reserv_populate: reserv %p is already full", rv)); KASSERT(rv->pages->psind == 0, ("vm_reserv_populate: reserv %p is already promoted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } popmap_set(rv->popmap, index); rv->popcnt++; if (rv->popcnt < VM_LEVEL_0_NPAGES) { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq); } else rv->pages->psind = 1; } /* * Allocates a contiguous set of physical pages of the given size "npages" * from existing or newly created reservations. All of the physical pages * must be at or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross any * physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. * + * The page "mpred" must immediately precede the offset "pindex" within the + * specified object. + * * The object and free page queue must be locked. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, - vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) + vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, + vm_page_t mpred) { vm_paddr_t pa, size; - vm_page_t m, m_ret, mpred, msucc; + vm_page_t m, m_ret, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; u_long allocpages, maxpages, minpages; int i, index, n; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex + npages > object->size) return (NULL); /* * All reservations of a particular size have the same alignment. * Assuming that the first page is allocated from a reservation, the * least significant bits of its physical address can be determined * from its offset from the beginning of the reservation and the size * of the reservation. * * Could the specified index within a reservation of the smallest * possible size satisfy the alignment and boundary requirements? */ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT; if ((pa & (alignment - 1)) != 0) return (NULL); size = npages << PAGE_SHIFT; if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* * Look for an existing reservation. */ - mpred = vm_radix_lookup_le(&object->rtree, pindex); if (mpred != NULL) { + KASSERT(mpred->object == object, + ("vm_reserv_alloc_contig: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, - ("vm_reserv_alloc_contig: pindex already allocated")); + ("vm_reserv_alloc_contig: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, - ("vm_reserv_alloc_contig: pindex already allocated")); + ("vm_reserv_alloc_contig: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } /* * Could at least one reservation fit between the first index to the * left that can be used ("leftcap") and the first index to the right * that cannot be used ("rightcap")? */ first = pindex - VM_RESERV_INDEX(object, pindex); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) return (NULL); } minpages = VM_RESERV_INDEX(object, pindex) + npages; maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES); allocpages = maxpages; if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + maxpages > rightcap) { if (maxpages == VM_LEVEL_0_NPAGES) return (NULL); /* * At least one reservation will fit between "leftcap" * and "rightcap". However, a reservation for the * last of the requested pages will not fit. Reduce * the size of the upcoming allocation accordingly. */ allocpages = minpages; } } /* * Would the last new reservation extend past the end of the object? */ if (first + maxpages > object->size) { /* * Don't allocate the last new reservation if the object is a * vnode or backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) { if (maxpages == VM_LEVEL_0_NPAGES) return (NULL); allocpages = minpages; } /* Speculate that the object may grow. */ } /* * Allocate the physical pages. The alignment and boundary specified * for this allocation may be different from the alignment and * boundary specified for the requested pages. For instance, the * specified index may not be the first page within the first new * reservation. */ m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment, VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0); if (m == NULL) return (NULL); /* * The allocated physical pages always begin at a reservation * boundary, but they do not always end at a reservation boundary. * Initialize every reservation that is completely covered by the * allocated physical pages. */ m_ret = NULL; index = VM_RESERV_INDEX(object, pindex); do { rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_contig: reserv %p's pages is corrupted", rv)); KASSERT(rv->object == NULL, ("vm_reserv_alloc_contig: reserv %p isn't free", rv)); LIST_INSERT_HEAD(&object->rvq, rv, objq); rv->object = object; rv->pindex = first; KASSERT(rv->popcnt == 0, ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted", rv)); n = ulmin(VM_LEVEL_0_NPAGES - index, npages); for (i = 0; i < n; i++) vm_reserv_populate(rv, index + i); npages -= n; if (m_ret == NULL) { m_ret = &rv->pages[index]; index = 0; } m += VM_LEVEL_0_NPAGES; first += VM_LEVEL_0_NPAGES; allocpages -= VM_LEVEL_0_NPAGES; } while (allocpages >= VM_LEVEL_0_NPAGES); return (m_ret); /* * Found a matching reservation. */ found: index = VM_RESERV_INDEX(object, pindex); /* Does the allocation fit within the reservation? */ if (index + npages > VM_LEVEL_0_NPAGES) return (NULL); m = &rv->pages[index]; pa = VM_PAGE_TO_PHYS(m); if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* Handle vm_page_rename(m, new_object, ...). */ for (i = 0; i < npages; i++) if (popmap_is_set(rv->popmap, index + i)) return (NULL); for (i = 0; i < npages; i++) vm_reserv_populate(rv, index + i); return (m); } /* - * Allocates a page from an existing or newly-created reservation. + * Allocates a page from an existing or newly created reservation. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t m, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; int i, index; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(object); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex >= object->size) return (NULL); /* * Look for an existing reservation. */ if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_reserv_alloc_page: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_reserv_alloc_page: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, ("vm_reserv_alloc_page: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } /* * Could a reservation fit between the first index to the left that * can be used and the first index to the right that cannot be used? */ first = pindex - VM_RESERV_INDEX(object, pindex); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) return (NULL); } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + VM_LEVEL_0_NPAGES > rightcap) return (NULL); } /* * Would a new reservation extend past the end of the object? */ if (first + VM_LEVEL_0_NPAGES > object->size) { /* * Don't allocate a new reservation if the object is a vnode or * backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) return (NULL); /* Speculate that the object may grow. */ } /* * Allocate and populate the new reservation. */ m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER); if (m == NULL) return (NULL); rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv)); KASSERT(rv->object == NULL, ("vm_reserv_alloc_page: reserv %p isn't free", rv)); LIST_INSERT_HEAD(&object->rvq, rv, objq); rv->object = object; rv->pindex = first; KASSERT(rv->popcnt == 0, ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_alloc_page: reserv %p's popmap is corrupted", rv)); index = VM_RESERV_INDEX(object, pindex); vm_reserv_populate(rv, index); return (&rv->pages[index]); /* * Found a matching reservation. */ found: index = VM_RESERV_INDEX(object, pindex); m = &rv->pages[index]; /* Handle vm_page_rename(m, new_object, ...). */ if (popmap_is_set(rv->popmap, index)) return (NULL); vm_reserv_populate(rv, index); return (m); } /* - * Breaks the given reservation. Except for the specified cached or free - * page, all cached and free pages in the reservation are returned to the - * physical memory allocator. The reservation's population count and map are - * reset to their initial state. + * Breaks the given reservation. Except for the specified free page, all free + * pages in the reservation are returned to the physical memory allocator. + * The reservation's population count and map are reset to their initial + * state. * - * The given reservation must not be in the partially-populated reservation + * The given reservation must not be in the partially populated reservation * queue. The free page queue lock must be held. */ static void vm_reserv_break(vm_reserv_t rv, vm_page_t m) { int begin_zeroes, hi, i, lo; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->object != NULL, ("vm_reserv_break: reserv %p is free", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv)); LIST_REMOVE(rv, objq); rv->object = NULL; if (m != NULL) { /* * Since the reservation is being broken, there is no harm in * abusing the population map to stop "m" from being returned * to the physical memory allocator. */ i = m - rv->pages; KASSERT(popmap_is_clear(rv->popmap, i), ("vm_reserv_break: reserv %p's popmap is corrupted", rv)); popmap_set(rv->popmap, i); rv->popcnt++; } i = hi = 0; do { /* Find the next 0 bit. Any previous 0 bits are < "hi". */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); if (lo == 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP - hi; while (++i < NPOPMAP) { lo = ffsl(~rv->popmap[i]); if (lo == 0) { rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP; } else break; } if (i == NPOPMAP) break; hi = 0; } KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo)); /* Convert from ffsl() to ordinary bit numbering. */ lo--; if (lo > 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] &= ~((1UL << lo) - 1); rv->popcnt -= lo - hi; } begin_zeroes = NBPOPMAP * i + lo; /* Find the next 1 bit. */ do hi = ffsl(rv->popmap[i]); while (hi == 0 && ++i < NPOPMAP); if (i != NPOPMAP) /* Convert from ffsl() to ordinary bit numbering. */ hi--; vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i + hi - begin_zeroes); } while (i < NPOPMAP); KASSERT(rv->popcnt == 0, ("vm_reserv_break: reserv %p's popcnt is corrupted", rv)); vm_reserv_broken++; } /* * Breaks all reservations belonging to the given object. */ void vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; mtx_lock(&vm_page_queue_free_mtx); while ((rv = LIST_FIRST(&object->rvq)) != NULL) { KASSERT(rv->object == object, ("vm_reserv_break_all: reserv %p is corrupted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; } vm_reserv_break(rv, NULL); } mtx_unlock(&vm_page_queue_free_mtx); } /* * Frees the given page if it belongs to a reservation. Returns TRUE if the * page is freed and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_free_page(vm_page_t m) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); vm_reserv_depopulate(rv, m - rv->pages); return (TRUE); } /* * Initializes the reservation management system. Specifically, initializes * the reservation array. * * Requires that vm_page_array and first_page are initialized! */ void vm_reserv_init(void) { vm_paddr_t paddr; struct vm_phys_seg *seg; int segind; /* * Initialize the reservation array. Specifically, initialize the * "pages" field for every element that has an underlying superpage. */ for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages = PHYS_TO_VM_PAGE(paddr); paddr += VM_LEVEL_0_SIZE; } } } /* * Returns true if the given page belongs to a reservation and that page is * free. Otherwise, returns false. */ bool vm_reserv_is_page_free(vm_page_t m) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); rv = vm_reserv_from_page(m); if (rv->object == NULL) return (false); return (popmap_is_clear(rv->popmap, m - rv->pages)); } /* * If the given page belongs to a reservation, returns the level of that * reservation. Otherwise, returns -1. */ int vm_reserv_level(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->object != NULL ? 0 : -1); } /* - * Returns a reservation level if the given page belongs to a fully-populated + * Returns a reservation level if the given page belongs to a fully populated * reservation and -1 otherwise. */ int vm_reserv_level_iffullpop(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); } /* - * Prepare for the reactivation of a cached page. + * Breaks the given partially populated reservation, releasing its free pages + * to the physical memory allocator. * - * First, suppose that the given page "m" was allocated individually, i.e., not - * as part of a reservation, and cached. Then, suppose a reservation - * containing "m" is allocated by the same object. Although "m" and the - * reservation belong to the same object, "m"'s pindex may not match the - * reservation's. - * - * The free page queue must be locked. - */ -boolean_t -vm_reserv_reactivate_page(vm_page_t m) -{ - vm_reserv_t rv; - int index; - - mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); - rv = vm_reserv_from_page(m); - if (rv->object == NULL) - return (FALSE); - KASSERT((m->flags & PG_CACHED) != 0, - ("vm_reserv_reactivate_page: page %p is not cached", m)); - if (m->object == rv->object && - m->pindex - rv->pindex == (index = VM_RESERV_INDEX(m->object, - m->pindex))) - vm_reserv_populate(rv, index); - else { - KASSERT(rv->inpartpopq, - ("vm_reserv_reactivate_page: reserv %p's inpartpopq is FALSE", - rv)); - TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); - rv->inpartpopq = FALSE; - /* Don't release "m" to the physical memory allocator. */ - vm_reserv_break(rv, m); - } - return (TRUE); -} - -/* - * Breaks the given partially-populated reservation, releasing its cached and - * free pages to the physical memory allocator. - * * The free page queue lock must be held. */ static void vm_reserv_reclaim(vm_reserv_t rv) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT(rv->inpartpopq, ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq); rv->inpartpopq = FALSE; vm_reserv_break(rv, NULL); vm_reserv_reclaimed++; } /* - * Breaks the reservation at the head of the partially-populated reservation - * queue, releasing its cached and free pages to the physical memory - * allocator. Returns TRUE if a reservation is broken and FALSE otherwise. + * Breaks the reservation at the head of the partially populated reservation + * queue, releasing its free pages to the physical memory allocator. Returns + * TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_reclaim_inactive(void) { vm_reserv_t rv; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) { vm_reserv_reclaim(rv); return (TRUE); } return (FALSE); } /* - * Searches the partially-populated reservation queue for the least recently - * active reservation with unused pages, i.e., cached or free, that satisfy the - * given request for contiguous physical memory. If a satisfactory reservation - * is found, it is broken. Returns TRUE if a reservation is broken and FALSE - * otherwise. + * Searches the partially populated reservation queue for the least recently + * changed reservation with free pages that satisfy the given request for + * contiguous physical memory. If a satisfactory reservation is found, it is + * broken. Returns TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa, size; vm_reserv_t rv; int hi, i, lo, low_index, next_free; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (npages > VM_LEVEL_0_NPAGES - 1) return (FALSE); size = npages << PAGE_SHIFT; TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) { pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]); if (pa + PAGE_SIZE - size < low) { /* This entire reservation is too low; go to next. */ continue; } pa = VM_PAGE_TO_PHYS(&rv->pages[0]); if (pa + size > high) { /* This entire reservation is too high; go to next. */ continue; } if (pa < low) { /* Start the search for free pages at "low". */ low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT; i = low_index / NBPOPMAP; hi = low_index % NBPOPMAP; } else i = hi = 0; do { /* Find the next free page. */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); while (lo == 0 && ++i < NPOPMAP) lo = ffsl(~rv->popmap[i]); if (i == NPOPMAP) break; /* Convert from ffsl() to ordinary bit numbering. */ lo--; next_free = NBPOPMAP * i + lo; pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]); KASSERT(pa >= low, ("vm_reserv_reclaim_contig: pa is too low")); if (pa + size > high) { /* The rest of this reservation is too high. */ break; } else if ((pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { /* * The current page doesn't meet the alignment * and/or boundary requirements. Continue * searching this reservation until the rest * of its free pages are either excluded or * exhausted. */ hi = lo + 1; if (hi >= NBPOPMAP) { hi = 0; i++; } continue; } /* Find the next used page. */ hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1)); while (hi == 0 && ++i < NPOPMAP) { if ((NBPOPMAP * i - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } hi = ffsl(rv->popmap[i]); } /* Convert from ffsl() to ordinary bit numbering. */ if (i != NPOPMAP) hi--; if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } } while (i < NPOPMAP); } return (FALSE); } /* * Transfers the reservation underlying the given page to a new object. * * The object must be locked. */ void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset) { vm_reserv_t rv; VM_OBJECT_ASSERT_WLOCKED(new_object); rv = vm_reserv_from_page(m); if (rv->object == old_object) { mtx_lock(&vm_page_queue_free_mtx); if (rv->object == old_object) { LIST_REMOVE(rv, objq); LIST_INSERT_HEAD(&new_object->rvq, rv, objq); rv->object = new_object; rv->pindex -= old_object_offset; } mtx_unlock(&vm_page_queue_free_mtx); } } /* * Returns the size (in bytes) of a reservation of the specified level. */ int vm_reserv_size(int level) { switch (level) { case 0: return (VM_LEVEL_0_SIZE); case -1: return (PAGE_SIZE); default: return (0); } } /* * Allocates the virtual and physical memory required by the reservation * management system's data structures, in particular, the reservation array. */ vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water) { vm_paddr_t new_end; size_t size; /* * Calculate the size (in bytes) of the reservation array. Round up * from "high_water" because every small page is mapped to an element * in the reservation array based on its physical address. Thus, the * number of elements in the reservation array can be greater than the * number of superpages. */ size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv); /* * Allocate and map the physical memory for the reservation array. The * next available virtual address is returned by reference. */ new_end = end - round_page(size); vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero(vm_reserv_array, size); /* * Return the next available physical address. */ return (new_end); } #endif /* VM_NRESERVLEVEL > 0 */ Index: stable/11/sys/vm/vm_reserv.h =================================================================== --- stable/11/sys/vm/vm_reserv.h (revision 318715) +++ stable/11/sys/vm/vm_reserv.h (revision 318716) @@ -1,71 +1,70 @@ /*- * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2008 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Superpage reservation management definitions */ #ifndef _VM_RESERV_H_ #define _VM_RESERV_H_ #ifdef _KERNEL #if VM_NRESERVLEVEL > 0 /* * The following functions are only to be used by the virtual memory system. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages, vm_paddr_t low, vm_paddr_t high, - u_long alignment, vm_paddr_t boundary); + u_long alignment, vm_paddr_t boundary, vm_page_t mpred); vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); void vm_reserv_break_all(vm_object_t object); boolean_t vm_reserv_free_page(vm_page_t m); void vm_reserv_init(void); bool vm_reserv_is_page_free(vm_page_t m); int vm_reserv_level(vm_page_t m); int vm_reserv_level_iffullpop(vm_page_t m); -boolean_t vm_reserv_reactivate_page(vm_page_t m); boolean_t vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); boolean_t vm_reserv_reclaim_inactive(void); void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset); int vm_reserv_size(int level); vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water); #endif /* VM_NRESERVLEVEL > 0 */ #endif /* _KERNEL */ #endif /* !_VM_RESERV_H_ */ Index: stable/11/sys/vm/vnode_pager.c =================================================================== --- stable/11/sys/vm/vnode_pager.c (revision 318715) +++ stable/11/sys/vm/vnode_pager.c (revision 318716) @@ -1,1390 +1,1384 @@ /*- * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); static int vnode_pager_generic_getpages_done(struct buf *); static void vnode_pager_generic_getpages_done_async(struct buf *); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, }; int vnode_pbuf_freecnt; int vnode_async_pbuf_freecnt; /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) return (0); while ((object = vp->v_object) != NULL) { VM_OBJECT_WLOCK(object); if (!(object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(object); return (0); } VOP_UNLOCK(vp, 0); vm_object_set_flag(object, OBJ_DISCONNECTWNT); VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } if (size == 0) { if (vn_isdisk(vp, NULL)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred)) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. */ VM_OBJECT_WLOCK(object); object->ref_count--; VM_OBJECT_WUNLOCK(object); vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL) return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_WLOCK(obj); umtx_shm_object_terminated(obj); if (obj->ref_count == 0) { /* * don't double-terminate the object */ if ((obj->flags & OBJ_DEAD) == 0) { vm_object_terminate(obj); } else { /* * Waiters were already handled during object * termination. The exclusive vnode lock hopefully * prevented new waiters from referencing the dying * object. */ KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0, ("OBJ_DISCONNECTWNT set obj %p flags %x", obj, obj->flags)); vp->v_object = NULL; VM_OBJECT_WUNLOCK(obj); } } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_WUNLOCK(obj); } KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object)); } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. * * MPSAFE */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *) handle; /* * If the object is being terminated, wait for it to * go away. */ retry: while ((object = vp->v_object) != NULL) { VM_OBJECT_WLOCK(object); if ((object->flags & OBJ_DEAD) == 0) break; vm_object_set_flag(object, OBJ_DISCONNECTWNT); VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0); } KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference")); if (object == NULL) { /* * Add an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; object->handle = handle; VI_LOCK(vp); if (vp->v_object != NULL) { /* * Object has been created while we were sleeping */ VI_UNLOCK(vp); VM_OBJECT_WLOCK(object); KASSERT(object->ref_count == 1, ("leaked ref %p %d", object, object->ref_count)); object->type = OBJT_DEAD; object->ref_count = 0; VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); goto retry; } vp->v_object = object; VI_UNLOCK(vp); } else { object->ref_count++; #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif VM_OBJECT_WUNLOCK(object); } vrefact(vp); return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "vnpdea"); refs = object->ref_count; object->handle = NULL; object->type = OBJT_DEAD; if (object->flags & OBJ_DISCONNECTWNT) { vm_object_clear_flag(object, OBJ_DISCONNECTWNT); wakeup(object); } ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } vp->v_object = NULL; VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); while (refs-- > 0) vunref(vp); VM_OBJECT_WLOCK(object); } static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_WLOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || vp->v_iflag & VI_DOOMED) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } VM_OBJECT_WUNLOCK(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_WLOCK(object); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; /* ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */ VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); return; } KASSERT(object->type == OBJT_VNODE, ("not vnode-backed object %p", object)); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_WUNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ if ((nsize & PAGE_MASK) && (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && m->valid != 0) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Update the valid bits to reflect the blocks that * have been zeroed. Some of these valid bits may * have already been set. */ vm_page_set_valid_range(m, base, size); /* * Round "base" to the next block boundary so that the * dirty bit for a partially zeroed block is not * cleared. */ base = roundup2(base, DEV_BSIZE); /* * Clear out partial-page dirty bits. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); - } else if ((nsize & PAGE_MASK) && - vm_page_is_cached(object, OFF_TO_IDX(nsize))) { - vm_page_cache_free(object, OFF_TO_IDX(nsize), - nobjsize); } } object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_WUNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (vp->v_iflag & VI_DOOMED) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: page %p is dirty", m)); VM_OBJECT_WLOCK(object); m->valid |= bits; VM_OBJECT_WUNLOCK(object); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_WUNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_WLOCK(object); } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) m->valid = VM_PAGE_BITS_ALL; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); VM_OBJECT_WLOCK(object); return rtval; } static int vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages_async not implemented\n")); VM_OBJECT_WLOCK(object); return (rtval); } /* * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for * local filesystems, where partially valid pages can only occur at * the end of file. */ int vnode_pager_local_getpages(struct vop_getpages_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; int bsize, pagesperblock, *freecnt; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= sizeof(bp->b_pages), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (m[count - 1]->valid != 0 && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } /* * Synchronous and asynchronous paging operations use different * free pbuf counters. This is done to avoid asynchronous requests * to consume all pbufs. * Allocate the pbuf at the very beginning of the function, so that * if we are low on certain kind of pbufs don't even proceed to BMAP, * but sleep. */ freecnt = iodone != NULL ? &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; bp = getpbuf(freecnt); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { relpbuf(bp, freecnt); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { relpbuf(bp, freecnt); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { relpbuf(bp, freecnt); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); relpbuf(bp, freecnt); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); VM_OBJECT_WLOCK(object); m[0]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. */ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); KASSERT(rbehind + rahead + count <= sizeof(bp->b_pages), ("%s: behind %d ahead %d count %d", __func__, rbehind, rahead, count)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { - p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | - VM_ALLOC_IFNOTCACHED); + p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { - p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | - VM_ALLOC_IFNOTCACHED); + p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; KASSERT(bp->b_npages <= sizeof(bp->b_pages), ("%s: buf %p overflowed", __func__, bp)); /* * Recalculate first offset and bytecount with regards to read behind. * Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); atomic_add_long(&runningbufspace, bp->b_runningbufspace); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } static void vnode_pager_generic_getpages_done_async(struct buf *bp) { int error; error = vnode_pager_generic_getpages_done(bp); /* Run the iodone upon the requested range. */ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); for (int i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_async_pbuf_freecnt); } static int vnode_pager_generic_getpages_done(struct buf *bp) { vm_object_t object; off_t tfoff, nextoff; int i, error; error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0; object = bp->b_vp->v_object; if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { if (!buf_mapped(bp)) { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } bzero(bp->b_data + bp->b_bcount, PAGE_SIZE * bp->b_npages - bp->b_bcount); } if (buf_mapped(bp)) { pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); bp->b_data = unmapped_buf; } VM_OBJECT_WLOCK(object); for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = bp->b_pages[i]; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. */ mt->valid = VM_PAGE_BITS_ALL; KASSERT(mt->dirty == 0, ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt), ("%s: page %p is mapped", __func__, mt)); } else { /* * Read did not fill up entire page. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_valid_range(mt, 0, object->un_pager.vnp.vnp_size - tfoff); KASSERT((mt->dirty & vm_page_bits(0, object->un_pager.vnp.vnp_size - tfoff)) == 0, ("%s: page %p is dirty", __func__, mt)); } if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(mt); } VM_OBJECT_WUNLOCK(object); if (error != 0) printf("%s: I/O read error %d\n", __func__, error); return (error); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_PUTPAGES. */ static void vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int rtval; struct vnode *vp; int bytes = count * PAGE_SIZE; /* * Force synchronous operation if we are extremely low on memory * to prevent a low-memory deadlock. VOP operations often need to * allocate more memory to initiate the I/O ( i.e. do a BMAP * operation ). The swapper handles the case by limiting the amount * of asynchronous I/O, but that sort of solution doesn't scale well * for the vnode pager without a lot of work. * * Also, the backing vnode's iodone routine may not wake the pageout * daemon up. This should be probably be addressed XXX. */ if (vm_cnt.v_free_count + vm_cnt.v_cache_count < vm_cnt.v_pageout_free_min) flags |= VM_PAGER_PUT_SYNC; /* * Call device-specific putpages function */ vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_WLOCK(object); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and * clustering has already typically occurred, so in general we ask the * underlying filesystem to write the data out asynchronously rather * then delayed. */ int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, int flags, int *rtvals) { vm_object_t object; vm_page_t m; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int count, error, i, maxsize, ncount, ppscheck; static struct timeval lastfail; static int curfail; object = vp->v_object; count = bytecount / PAGE_SIZE; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_ERROR; if ((int64_t)ma[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n", (long)ma[0]->pindex, (u_long)ma[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(ma[0]->pindex); /* * If the page-aligned write is larger then the actual file we * have to invalidate pages occurring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned where * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). * With the page locked we are free to fix-up the dirty bits here. * * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ VM_OBJECT_WLOCK(object); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) { int pgoff; maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("vnode_pager_generic_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { maxsize = 0; ncount = 0; } if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } } } VM_OBJECT_WUNLOCK(object); aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_td = (struct thread *) 0; error = VOP_WRITE(vp, &auio, vnode_pager_putpages_ioflags(flags), curthread->td_ucred); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, ncount); ppscheck = 0; if (error) { if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1))) printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { if (ppscheck || ppsratecheck(&lastfail, &curfail, 1)) printf("vnode_pager_putpages: residual I/O %zd at %lu\n", auio.uio_resid, (u_long)ma[0]->pindex); } for (i = 0; i < ncount; i++) { rtvals[i] = VM_PAGER_OK; } return rtvals[0]; } int vnode_pager_putpages_ioflags(int pager_flags) { int ioflags; /* * Pageouts are already clustered, use IO_ASYNC to force a * bawrite() rather then a bdwrite() to prevent paging I/O * from saturating the buffer cache. Dummy-up the sequential * heuristic to cause large ranges to cluster. If neither * IO_SYNC or IO_ASYNC is set, the system decides how to * cluster. */ ioflags = IO_VMIO; if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0) ioflags |= IO_SYNC; else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0; #ifdef notyet ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0; #endif ioflags |= IO_SEQMAX << IO_SEQSHIFT; return (ioflags); } void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written) { vm_object_t obj; int i, pos; if (written == 0) return; obj = ma[0]->object; VM_OBJECT_WLOCK(obj); for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) { if (pos < trunc_page(written)) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(ma[i]); } else { /* Partially written page. */ rtvals[i] = VM_PAGER_AGAIN; vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK); } } VM_OBJECT_WUNLOCK(obj); } void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; vm_ooffset_t old_wm; VM_OBJECT_WLOCK(object); if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } old_wm = object->un_pager.vnp.writemappings; object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start; vp = object->handle; if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) { ASSERT_VOP_ELOCKED(vp, "v_writecount inc"); VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) { ASSERT_VOP_ELOCKED(vp, "v_writecount dec"); VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } VM_OBJECT_WUNLOCK(object); } void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; struct mount *mp; vm_offset_t inc; VM_OBJECT_WLOCK(object); /* * First, recheck the object type to account for the race when * the vnode is reclaimed. */ if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } /* * Optimize for the case when writemappings is not going to * zero. */ inc = end - start; if (object->un_pager.vnp.writemappings != inc) { object->un_pager.vnp.writemappings -= inc; VM_OBJECT_WUNLOCK(object); return; } vp = object->handle; vhold(vp); VM_OBJECT_WUNLOCK(object); mp = NULL; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Decrement the object's writemappings, by swapping the start * and end arguments for vnode_pager_update_writecount(). If * there was not a race with vnode reclaimation, then the * vnode's v_writecount is decremented. */ vnode_pager_update_writecount(object, end, start); VOP_UNLOCK(vp, 0); vdrop(vp); if (mp != NULL) vn_finished_write(mp); } Index: stable/11/usr.bin/systat/systat.1 =================================================================== --- stable/11/usr.bin/systat/systat.1 (revision 318715) +++ stable/11/usr.bin/systat/systat.1 (revision 318716) @@ -1,686 +1,686 @@ .\" Copyright (c) 1985, 1990, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 4. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)systat.1 8.2 (Berkeley) 12/30/93 .\" $FreeBSD$ .\" .Dd December 5, 2016 .Dt SYSTAT 1 .Os .Sh NAME .Nm systat .Nd display system statistics .Sh SYNOPSIS .Nm .Op Fl display .Op Ar display-commands .Op Ar refresh-interval .Sh DESCRIPTION The .Nm utility displays various system statistics in a screen oriented fashion using the curses screen display library, .Xr ncurses 3 . .Pp While .Nm is running the screen is usually divided into two windows (an exception is the vmstat display which uses the entire screen). The upper window depicts the current system load average. The information displayed in the lower window may vary, depending on user commands. The last line on the screen is reserved for user input and error messages. .Pp By default .Nm displays the processes getting the largest percentage of the processor in the lower window. Other displays show swap space usage, disk .Tn I/O statistics (a la .Xr iostat 8 ) , virtual memory statistics (a la .Xr vmstat 8 ) , .Tn TCP/IP statistics, and network connections (a la .Xr netstat 1 ) . .Pp Input is interpreted at two different levels. A ``global'' command interpreter processes all keyboard input. If this command interpreter fails to recognize a command, the input line is passed to a per-display command interpreter. This allows each display to have certain display-specific commands. .Pp Command line options: .Bl -tag -width "refresh_interval" .It Fl Ns Ar display The .Fl flag expects .Ar display to be one of: .Ic icmp , .Ic icmp6 , .Ic ifstat , .Ic iostat , .Ic ip , .Ic ip6 , .Ic netstat , .Ic pigs , .Ic sctp , .Ic swap , .Ic tcp , .Ic vmstat , or .Ic zarc , These displays can also be requested interactively (without the .Dq Fl ) and are described in full detail below. .It Ar refresh-interval The .Ar refresh-value specifies the screen refresh time interval in seconds. Time interval can be fractional. .It Ar display-commands A list of commands specific to this display. These commands can also be entered interactively and are described for each display separately below. If the command requires arguments, they can be specified as separate command line arguments. A command line argument .Fl - will finish display commands. For example: .Pp .Dl Nm Fl ifstat Fl match Ar bge0,em1 Fl pps .Pp This will display statistics of packets per second for network interfaces named as bge0 and em1. .Pp .Dl Nm Fl iostat Fl numbers Fl - Ar 2.1 .Pp This will display all IO statistics in a numeric format and the information will be refreshed each 2.1 seconds. .El .Pp Certain characters cause immediate action by .Nm . These are .Bl -tag -width Fl .It Ic \&^L Refresh the screen. .It Ic \&^G Print the name of the current ``display'' being shown in the lower window and the refresh interval. .It Ic \&: Move the cursor to the command line and interpret the input line typed as a command. While entering a command the current character erase, word erase, and line kill characters may be used. .El .Pp The following commands are interpreted by the ``global'' command interpreter. .Bl -tag -width Fl .It Ic help Print the names of the available displays on the command line. .It Ic load Print the load average over the past 1, 5, and 15 minutes on the command line. .It Ic stop Stop refreshing the screen. .It Xo .Op Ic start .Op Ar number .Xc Start (continue) refreshing the screen. If a second, numeric, argument is provided it is interpreted as a refresh interval (in seconds). Supplying only a number will set the refresh interval to this value. .It Ic quit Exit .Nm . (This may be abbreviated to .Ic q . ) .El .Pp The available displays are: .Bl -tag -width Ic .It Ic pigs Display, in the lower window, those processes resident in main memory and getting the largest portion of the processor (the default display). When less than 100% of the processor is scheduled to user processes, the remaining time is accounted to the ``idle'' process. .It Ic icmp Display, in the lower window, statistics about messages received and transmitted by the Internet Control Message Protocol .Pq Dq Tn ICMP . The left half of the screen displays information about received packets, and the right half displays information regarding transmitted packets. .Pp The .Ic icmp display understands two commands: .Ic mode and .Ic reset . The .Ic mode command is used to select one of four display modes, given as its argument: .Bl -tag -width absoluteXX -compact .It Ic rate : show the rate of change of each value in packets (the default) per second .It Ic delta : show the rate of change of each value in packets per refresh interval .It Ic since : show the total change of each value since the display was last reset .It Ic absolute : show the absolute value of each statistic .El .Pp The .Ic reset command resets the baseline for .Ic since mode. The .Ic mode command with no argument will display the current mode in the command line. .It Ic icmp6 This display is like the .Ic icmp display, but displays statistics for IPv6 ICMP. .It Ic ip Otherwise identical to the .Ic icmp display, except that it displays .Tn IP and .Tn UDP statistics. .It Ic ip6 Like the .Ic ip display, except that it displays .Tn IPv6 statistics. It does not display .Tn UDP statistics. .It Ic sctp Like .Ic icmp , but with .Tn SCTP statistics. .It Ic tcp Like .Ic icmp , but with .Tn TCP statistics. .It Ic iostat Display, in the lower window, statistics about processor use and disk throughput. Statistics on processor use appear as bar graphs of the amount of time executing in user mode (``user''), in user mode running low priority processes (``nice''), in system mode (``system''), in interrupt mode (``interrupt''), and idle (``idle''). Statistics on disk throughput show, for each drive, megabytes per second, average number of disk transactions per second, and average kilobytes of data per transaction. This information may be displayed as bar graphs or as rows of numbers which scroll downward. Bar graphs are shown by default. .Pp The following commands are specific to the .Ic iostat display; the minimum unambiguous prefix may be supplied. .Pp .Bl -tag -width Fl -compact .It Cm numbers Show the disk .Tn I/O statistics in numeric form. Values are displayed in numeric columns which scroll downward. .It Cm bars Show the disk .Tn I/O statistics in bar graph form (default). .It Cm kbpt Toggle the display of kilobytes per transaction. (the default is to not display kilobytes per transaction). .El .It Ic swap Show information about swap space usage on all the swap areas compiled into the kernel. The first column is the device name of the partition. The next column is the total space available in the partition. The .Ar Used column indicates the total blocks used so far; the graph shows the percentage of space in use on each partition. If there are more than one swap partition in use, a total line is also shown. Areas known to the kernel, but not in use are shown as not available. .It Ic vmstat Take over the entire display and show a (rather crowded) compendium of statistics related to virtual memory usage, process scheduling, device interrupts, system name translation caching, disk .Tn I/O etc. .Pp The upper left quadrant of the screen shows the number of users logged in and the load average over the last one, five, and fifteen minute intervals. Below this line are statistics on memory utilization. The first row of the table reports memory usage only among active processes, that is processes that have run in the previous twenty seconds. The second row reports on memory usage of all processes. The first column reports on the number of kilobytes in physical pages claimed by processes. The second column reports the number of kilobytes in physical pages that are devoted to read only text pages. The third and fourth columns report the same two figures for virtual pages, that is the number of kilobytes in pages that would be needed if all processes had all of their pages. Finally the last column shows the number of kilobytes in physical pages on the free list. .Pp Below the memory display is a list of the average number of processes (over the last refresh interval) that are runnable (`r'), in page wait (`p'), in disk wait other than paging (`d'), sleeping (`s'), and swapped out but desiring to run (`w'). The row also shows the average number of context switches (`Csw'), traps (`Trp'; includes page faults), system calls (`Sys'), interrupts (`Int'), network software interrupts (`Sof'), and page faults (`Flt'). .Pp Below the process queue length listing is a numerical listing and a bar graph showing the amount of system (shown as `='), interrupt (shown as `+'), user (shown as `>'), nice (shown as `-'), and idle time (shown as ` '). .Pp Below the process display are statistics on name translations. It lists the number of names translated in the previous interval, the number and percentage of the translations that were handled by the system wide name translation cache, and the number and percentage of the translations that were handled by the per process name translation cache. .Pp To the right of the name translations display are lines showing the number of dirty buffers in the buffer cache (`dtbuf'), desired maximum size of vnode cache (`desvn'), number of vnodes actually allocated (`numvn'), and number of allocated vnodes that are free (`frevn'). .Pp At the bottom left is the disk usage display. It reports the number of kilobytes per transaction, transactions per second, megabytes per second and the percentage of the time the disk was busy averaged over the refresh period of the display (by default, five seconds). The system keeps statistics on most every storage device. In general, up to seven devices are displayed. The devices displayed by default are the first devices in the kernel's device list. See .Xr devstat 3 and .Xr devstat 9 for details on the devstat system. .Pp Under the date in the upper right hand quadrant are statistics on paging and swapping activity. The first two columns report the average number of pages brought in and out per second over the last refresh interval due to page faults and the paging daemon. The third and fourth columns report the average number of pages brought in and out per second over the last refresh interval due to swap requests initiated by the scheduler. The first row of the display shows the average number of disk transfers per second over the last refresh interval; the second row of the display shows the average number of pages transferred per second over the last refresh interval. .Pp Below the paging statistics is a column of lines regarding the virtual memory system. The first few lines describe, in units (except as noted below) of pages per second averaged over the sampling interval, pages copied on write (`cow'), pages zero filled on demand (`zfod'), pages optimally zero filled on demand (`ozfod'), the ratio of the (average) ozfod / zfod as a percentage (`%ozfod'), pages freed by the page daemon (`daefr'), pages freed by exiting processes (`prcfr'), total pages freed (`totfr'), pages reactivated from the free list (`react'), the average number of times per second that the page daemon was awakened (`pdwak'), pages analyzed by the page daemon (`pdpgs'), and in-transit blocking page faults (`intrn'). Note that the units are special for `%ozfod' and `pdwak'. The next few lines describe, as amounts of memory in kilobytes, pages wired down (`wire'), active pages (`act'), inactive pages (`inact'), -pages on the cache queue (`cache'), +dirty pages queued for laundering (`laund'), and free pages (`free'). Note that the values displayed are the current transient ones; they are not averages. .Pp At the bottom of this column is a line showing the amount of virtual memory, in kilobytes, mapped into the buffer cache (`buf'). This statistic is not useful. It exists only as a placeholder for the corresponding useful statistic (the amount of real memory used to cache disks). The most important component of the latter (the amount of real memory used by the vm system to cache disks) is not available, but can be guessed from the `inact' amount under some system loads. .Pp Running down the right hand side of the display is a breakdown of the interrupts being handled by the system. At the top of the list is the total interrupts per second over the time interval. The rest of the column breaks down the total on a device by device basis. Only devices that have interrupted at least once since boot time are shown. .Pp The following commands are specific to the .Ic vmstat display; the minimum unambiguous prefix may be supplied. .Pp .Bl -tag -width Ar -compact .It Cm boot Display cumulative statistics since the system was booted. .It Cm run Display statistics as a running total from the point this command is given. .It Cm time Display statistics averaged over the refresh interval (the default). .It Cm zero Reset running statistics to zero. .El .It Ic zarc display arc cache usage and hit/miss statistics. .It Ic netstat Display, in the lower window, network connections. By default, network servers awaiting requests are not displayed. Each address is displayed in the format ``host.port'', with each shown symbolically, when possible. It is possible to have addresses displayed numerically, limit the display to a set of ports, hosts, and/or protocols (the minimum unambiguous prefix may be supplied): .Pp .Bl -tag -width Ar -compact .It Cm all Toggle the displaying of server processes awaiting requests (this is the equivalent of the .Fl a flag to .Xr netstat 1 ) . .It Cm numbers Display network addresses numerically. .It Cm names Display network addresses symbolically. .It Cm proto Ar protocol Display only network connections using the indicated .Ar protocol . Supported protocols are ``tcp'', ``udp'', and ``all''. .It Cm ignore Op Ar items Do not display information about connections associated with the specified hosts or ports. Hosts and ports may be specified by name (``vangogh'', ``ftp''), or numerically. Host addresses use the Internet dot notation (``128.32.0.9''). Multiple items may be specified with a single command by separating them with spaces. .It Cm display Op Ar items Display information about the connections associated with the specified hosts or ports. As for .Ar ignore , .Op Ar items may be names or numbers. .It Cm show Op Ar ports\&|hosts Show, on the command line, the currently selected protocols, hosts, and ports. Hosts and ports which are being ignored are prefixed with a `!'. If .Ar ports or .Ar hosts is supplied as an argument to .Cm show , then only the requested information will be displayed. .It Cm reset Reset the port, host, and protocol matching mechanisms to the default (any protocol, port, or host). .El .It Ic ifstat Display the network traffic going through active interfaces on the system. Idle interfaces will not be displayed until they receive some traffic. .Pp For each interface being displayed, the current, peak and total statistics are displayed for incoming and outgoing traffic. By default, the .Ic ifstat display will automatically scale the units being used so that they are in a human-readable format. The scaling units used for the current and peak traffic columns can be altered by the .Ic scale command. .Bl -tag -width ".Cm scale Op Ar units" .It Cm scale Op Ar units Modify the scale used to display the current and peak traffic over all interfaces. The following units are recognised: kbit, kbyte, mbit, mbyte, gbit, gbyte and auto. .It Cm pps Show statistics in packets per second instead of bytes/bits per second. A subsequent call of .Ic pps switches this mode off. .It Cm match Op Ar patterns Display only interfaces that match pattern provided as an argument. Patterns should be in shell syntax separated by whitespaces or commas. If this command is called without arguments then all interfaces are displayed. For example: .Pp .Dl match em0, bge1 .Pp This will display em0 and bge1 interfaces. .Pp .Dl match em*, bge*, lo0 .Pp This will display all .Ic em interfaces, all .Ic bge interfaces and the loopback interface. .El .El .Pp Commands to switch between displays may be abbreviated to the minimum unambiguous prefix; for example, ``io'' for ``iostat''. Certain information may be discarded when the screen size is insufficient for display. For example, on a machine with 10 drives the .Ic iostat bar graph displays only 3 drives on a 24 line terminal. When a bar graph would overflow the allotted screen space it is truncated and the actual value is printed ``over top'' of the bar. .Pp The following commands are common to each display which shows information about disk drives. These commands are used to select a set of drives to report on, should your system have more drives configured than can normally be displayed on the screen. .Pp .Bl -tag -width Ar -compact .It Cm ignore Op Ar drives Do not display information about the drives indicated. Multiple drives may be specified, separated by spaces. .It Cm display Op Ar drives Display information about the drives indicated. Multiple drives may be specified, separated by spaces. .It Cm only Op Ar drives Display only the specified drives. Multiple drives may be specified, separated by spaces. .It Cm drives Display a list of available devices. .It Cm match Xo .Ar type , Ns Ar if , Ns Ar pass .Op | Ar ... .Xc Display devices matching the given pattern. The basic matching expressions are the same as those used in .Xr iostat 8 with one difference. Instead of specifying multiple .Fl t arguments which are then ORed together, the user instead specifies multiple matching expressions joined by the pipe .Pq Ql \&| character. The comma separated arguments within each matching expression are ANDed together, and then the pipe separated matching expressions are ORed together. Any device matching the combined expression will be displayed, if there is room to display it. For example: .Pp .Dl match da,scsi | cd,ide .Pp This will display all SCSI Direct Access devices and all IDE CDROM devices. .Pp .Dl match da | sa | cd,pass .Pp This will display all Direct Access devices, all Sequential Access devices, and all passthrough devices that provide access to CDROM drives. .El .Sh FILES .Bl -tag -width /boot/kernel/kernel -compact .It Pa /boot/kernel/kernel For the namelist. .It Pa /dev/kmem For information in main memory. .It Pa /etc/hosts For host names. .It Pa /etc/networks For network names. .It Pa /etc/services For port names. .El .Sh SEE ALSO .Xr netstat 1 , .Xr kvm 3 , .Xr icmp 4 , .Xr icmp6 4 , .Xr ip 4 , .Xr ip6 4 , .Xr tcp 4 , .Xr udp 4 , .Xr gstat 8 , .Xr iostat 8 , .Xr vmstat 8 .Sh HISTORY The .Nm program appeared in .Bx 4.3 . The .Ic icmp , .Ic ip , and .Ic tcp displays appeared in .Fx 3.0 ; the notion of having different display modes for the .Tn ICMP , .Tn IP , .Tn TCP , and .Tn UDP statistics was stolen from the .Fl C option to .Xr netstat 1 in Silicon Graphics' .Tn IRIX system. .Sh BUGS Certain displays presume a minimum of 80 characters per line. The .Ic vmstat display looks out of place because it is (it was added in as a separate display rather than created as a new program). Index: stable/11/usr.bin/systat/vmstat.c =================================================================== --- stable/11/usr.bin/systat/vmstat.c (revision 318715) +++ stable/11/usr.bin/systat/vmstat.c (revision 318716) @@ -1,896 +1,896 @@ /*- * Copyright (c) 1983, 1989, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef lint static const char sccsid[] = "@(#)vmstat.c 8.2 (Berkeley) 1/12/94"; #endif /* * Cursed vmstat -- from Robert Elz. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "systat.h" #include "extern.h" #include "devs.h" static struct Info { long time[CPUSTATES]; u_int v_swtch; /* context switches */ u_int v_trap; /* calls to trap */ u_int v_syscall; /* calls to syscall() */ u_int v_intr; /* device interrupts */ u_int v_soft; /* software interrupts */ /* * Virtual memory activity. */ u_int v_vm_faults; /* number of address memory faults */ u_int v_io_faults; /* page faults requiring I/O */ u_int v_cow_faults; /* number of copy-on-writes */ u_int v_zfod; /* pages zero filled on demand */ u_int v_ozfod; /* optimized zero fill pages */ u_int v_swapin; /* swap pager pageins */ u_int v_swapout; /* swap pager pageouts */ u_int v_swappgsin; /* swap pager pages paged in */ u_int v_swappgsout; /* swap pager pages paged out */ u_int v_vnodein; /* vnode pager pageins */ u_int v_vnodeout; /* vnode pager pageouts */ u_int v_vnodepgsin; /* vnode_pager pages paged in */ u_int v_vnodepgsout; /* vnode pager pages paged out */ u_int v_intrans; /* intransit blocking page faults */ - u_int v_reactivated; /* number of pages reactivated from free list */ + u_int v_reactivated; /* number of pages reactivated by pagedaemon */ u_int v_pdwakeups; /* number of times daemon has awaken from sleep */ u_int v_pdpages; /* number of pages analyzed by daemon */ u_int v_dfree; /* pages freed by daemon */ u_int v_pfree; /* pages freed by exiting processes */ u_int v_tfree; /* total pages freed */ /* * Distribution of page usages. */ u_int v_page_size; /* page size in bytes */ u_int v_free_count; /* number of pages free */ u_int v_wire_count; /* number of pages wired down */ u_int v_active_count; /* number of pages active */ u_int v_inactive_count; /* number of pages inactive */ - u_int v_cache_count; /* number of pages on buffer cache queue */ + u_int v_laundry_count; /* number of pages in laundry queue */ u_long v_kmem_map_size; /* Current kmem allocation size */ struct vmtotal Total; struct nchstats nchstats; long nchcount; long *intrcnt; long bufspace; int desiredvnodes; long numvnodes; long freevnodes; int numdirtybuffers; } s, s1, s2, z; static u_long kmem_size; static u_int v_page_count; struct statinfo cur, last, run; #define total s.Total #define nchtotal s.nchstats #define oldnchtotal s1.nchstats static enum state { BOOT, TIME, RUN } state = TIME; static void allocinfo(struct Info *); static void copyinfo(struct Info *, struct Info *); static float cputime(int); static void dinfo(int, int, struct statinfo *, struct statinfo *); static void getinfo(struct Info *); static void putint(int, int, int, int); static void putfloat(double, int, int, int, int, int); static void putlongdouble(long double, int, int, int, int, int); static int ucount(void); static int ncpu; static char buf[26]; static time_t t; static double etime; static int nintr; static long *intrloc; static char **intrname; static int nextintsrow; WINDOW * openkre(void) { return (stdscr); } void closekre(WINDOW *w) { if (w == NULL) return; wclear(w); wrefresh(w); } /* * These constants define where the major pieces are laid out */ #define STATROW 0 /* uses 1 row and 67 cols */ #define STATCOL 0 #define MEMROW 2 /* uses 4 rows and 45 cols */ #define MEMCOL 0 #define PAGEROW 2 /* uses 4 rows and 30 cols */ #define PAGECOL 47 #define INTSROW 6 /* uses all rows to bottom and 16 cols */ #define INTSCOL 64 #define PROCSROW 6 /* uses 3 rows and 19 cols */ #define PROCSCOL 0 #define GENSTATROW 7 /* uses 2 rows and 29 cols */ #define GENSTATCOL 21 #define VMSTATROW 7 /* uses 17 rows and 12-14 cols */ #define VMSTATCOL 49 /* actually 50-51 for some fields */ #define GRAPHROW 10 /* uses 3 rows and 49-51 cols */ #define GRAPHCOL 0 #define VNSTATROW 13 /* uses 4 rows and 13 columns */ #define VNSTATCOL 35 #define NAMEIROW 14 /* uses 3 rows and 32 cols */ #define NAMEICOL 0 #define DISKROW 18 /* uses 5 rows and 47 cols (for 7 drives) */ #define DISKCOL 0 #define DRIVESPACE 7 /* max # for space */ #define MAXDRIVES DRIVESPACE /* max # to display */ int initkre(void) { char *cp, *cp1, *cp2, *intrnamebuf, *nextcp; int i; size_t sz; if ((num_devices = devstat_getnumdevs(NULL)) < 0) { warnx("%s", devstat_errbuf); return(0); } cur.dinfo = calloc(1, sizeof(struct devinfo)); last.dinfo = calloc(1, sizeof(struct devinfo)); run.dinfo = calloc(1, sizeof(struct devinfo)); if (dsinit(MAXDRIVES, &cur, &last, &run) != 1) return(0); if (nintr == 0) { if (sysctlbyname("hw.intrcnt", NULL, &sz, NULL, 0) == -1) { error("sysctl(hw.intrcnt...) failed: %s", strerror(errno)); return (0); } nintr = sz / sizeof(u_long); intrloc = calloc(nintr, sizeof (long)); intrname = calloc(nintr, sizeof (char *)); intrnamebuf = sysctl_dynread("hw.intrnames", NULL); if (intrnamebuf == NULL || intrname == NULL || intrloc == NULL) { error("Out of memory"); if (intrnamebuf) free(intrnamebuf); if (intrname) free(intrname); if (intrloc) free(intrloc); nintr = 0; return(0); } for (cp = intrnamebuf, i = 0; i < nintr; i++) { nextcp = cp + strlen(cp) + 1; /* Discard trailing spaces. */ for (cp1 = nextcp - 1; cp1 > cp && *(cp1 - 1) == ' '; ) *--cp1 = '\0'; /* Convert "irqN: name" to "name irqN". */ if (strncmp(cp, "irq", 3) == 0) { cp1 = cp + 3; while (isdigit((u_char)*cp1)) cp1++; if (cp1 != cp && *cp1 == ':' && *(cp1 + 1) == ' ') { sz = strlen(cp); *cp1 = '\0'; cp1 = cp1 + 2; cp2 = strdup(cp); bcopy(cp1, cp, sz - (cp1 - cp) + 1); if (sz <= 10 + 4) { strcat(cp, " "); strcat(cp, cp2 + 3); } free(cp2); } } /* * Convert "name irqN" to "name N" if the former is * longer than the field width. */ if ((cp1 = strstr(cp, "irq")) != NULL && strlen(cp) > 10) bcopy(cp1 + 3, cp1, strlen(cp1 + 3) + 1); intrname[i] = cp; cp = nextcp; } nextintsrow = INTSROW + 2; allocinfo(&s); allocinfo(&s1); allocinfo(&s2); allocinfo(&z); } GETSYSCTL("vm.kmem_size", kmem_size); GETSYSCTL("vm.stats.vm.v_page_count", v_page_count); getinfo(&s2); copyinfo(&s2, &s1); return(1); } void fetchkre(void) { time_t now; struct tm *tp; static int d_first = -1; if (d_first < 0) d_first = (*nl_langinfo(D_MD_ORDER) == 'd'); time(&now); tp = localtime(&now); (void) strftime(buf, sizeof(buf), d_first ? "%e %b %R" : "%b %e %R", tp); getinfo(&s); } void labelkre(void) { int i, j; clear(); mvprintw(STATROW, STATCOL + 6, "users Load"); mvprintw(STATROW + 1, STATCOL + 3, "Mem usage: %%Phy %%Kmem"); mvprintw(MEMROW, MEMCOL, "Mem: KB REAL VIRTUAL"); mvprintw(MEMROW + 1, MEMCOL, " Tot Share Tot Share"); mvprintw(MEMROW + 2, MEMCOL, "Act"); mvprintw(MEMROW + 3, MEMCOL, "All"); mvprintw(MEMROW + 1, MEMCOL + 41, "Free"); mvprintw(PAGEROW, PAGECOL, " VN PAGER SWAP PAGER"); mvprintw(PAGEROW + 1, PAGECOL, " in out in out"); mvprintw(PAGEROW + 2, PAGECOL, "count"); mvprintw(PAGEROW + 3, PAGECOL, "pages"); mvprintw(INTSROW, INTSCOL + 1, "Interrupts"); mvprintw(INTSROW + 1, INTSCOL + 6, "total"); mvprintw(VMSTATROW, VMSTATCOL + 9, "ioflt"); mvprintw(VMSTATROW + 1, VMSTATCOL + 9, "cow"); mvprintw(VMSTATROW + 2, VMSTATCOL + 9, "zfod"); mvprintw(VMSTATROW + 3, VMSTATCOL + 9, "ozfod"); mvprintw(VMSTATROW + 4, VMSTATCOL + 9 - 1, "%%ozfod"); mvprintw(VMSTATROW + 5, VMSTATCOL + 9, "daefr"); mvprintw(VMSTATROW + 6, VMSTATCOL + 9, "prcfr"); mvprintw(VMSTATROW + 7, VMSTATCOL + 9, "totfr"); mvprintw(VMSTATROW + 8, VMSTATCOL + 9, "react"); mvprintw(VMSTATROW + 9, VMSTATCOL + 9, "pdwak"); mvprintw(VMSTATROW + 10, VMSTATCOL + 9, "pdpgs"); mvprintw(VMSTATROW + 11, VMSTATCOL + 9, "intrn"); mvprintw(VMSTATROW + 12, VMSTATCOL + 9, "wire"); mvprintw(VMSTATROW + 13, VMSTATCOL + 9, "act"); mvprintw(VMSTATROW + 14, VMSTATCOL + 9, "inact"); - mvprintw(VMSTATROW + 15, VMSTATCOL + 9, "cache"); + mvprintw(VMSTATROW + 15, VMSTATCOL + 9, "laund"); mvprintw(VMSTATROW + 16, VMSTATCOL + 9, "free"); if (LINES - 1 > VMSTATROW + 17) mvprintw(VMSTATROW + 17, VMSTATCOL + 9, "buf"); mvprintw(GENSTATROW, GENSTATCOL, " Csw Trp Sys Int Sof Flt"); mvprintw(GRAPHROW, GRAPHCOL, " . %%Sys . %%Intr . %%User . %%Nice . %%Idle"); mvprintw(PROCSROW, PROCSCOL, "Proc:"); mvprintw(PROCSROW + 1, PROCSCOL, " r p d s w"); mvprintw(GRAPHROW + 1, GRAPHCOL, "| | | | | | | | | | |"); mvprintw(VNSTATROW, VNSTATCOL + 8, "dtbuf"); mvprintw(VNSTATROW + 1, VNSTATCOL + 8, "desvn"); mvprintw(VNSTATROW + 2, VNSTATCOL + 8, "numvn"); mvprintw(VNSTATROW + 3, VNSTATCOL + 8, "frevn"); mvprintw(NAMEIROW, NAMEICOL, "Namei Name-cache Dir-cache"); mvprintw(NAMEIROW + 1, NAMEICOL, " Calls hits %% hits %%"); mvprintw(DISKROW, DISKCOL, "Disks"); mvprintw(DISKROW + 1, DISKCOL, "KB/t"); mvprintw(DISKROW + 2, DISKCOL, "tps"); mvprintw(DISKROW + 3, DISKCOL, "MB/s"); mvprintw(DISKROW + 4, DISKCOL, "%%busy"); /* * For now, we don't support a fourth disk statistic. So there's * no point in providing a label for it. If someone can think of a * fourth useful disk statistic, there is room to add it. */ /* mvprintw(DISKROW + 4, DISKCOL, " msps"); */ j = 0; for (i = 0; i < num_devices && j < MAXDRIVES; i++) if (dev_select[i].selected) { char tmpstr[80]; sprintf(tmpstr, "%s%d", dev_select[i].device_name, dev_select[i].unit_number); mvprintw(DISKROW, DISKCOL + 5 + 6 * j, " %5.5s", tmpstr); j++; } for (i = 0; i < nintr; i++) { if (intrloc[i] == 0) continue; mvprintw(intrloc[i], INTSCOL + 6, "%-10.10s", intrname[i]); } } #define X(fld) {t=s.fld[i]; s.fld[i]-=s1.fld[i]; if(state==TIME) s1.fld[i]=t;} #define Q(fld) {t=cur.fld[i]; cur.fld[i]-=last.fld[i]; if(state==TIME) last.fld[i]=t;} #define Y(fld) {t = s.fld; s.fld -= s1.fld; if(state == TIME) s1.fld = t;} #define Z(fld) {t = s.nchstats.fld; s.nchstats.fld -= s1.nchstats.fld; \ if(state == TIME) s1.nchstats.fld = t;} #define PUTRATE(fld, l, c, w) \ do { \ Y(fld); \ putint((int)((float)s.fld/etime + 0.5), l, c, w); \ } while (0) #define MAXFAIL 5 static char cpuchar[CPUSTATES] = { '=' , '+', '>', '-', ' ' }; static char cpuorder[CPUSTATES] = { CP_SYS, CP_INTR, CP_USER, CP_NICE, CP_IDLE }; void showkre(void) { float f1, f2; int psiz, inttotal; int i, l, lc; static int failcnt = 0; etime = 0; for(i = 0; i < CPUSTATES; i++) { X(time); Q(cp_time); etime += s.time[i]; } if (etime < 5.0) { /* < 5 ticks - ignore this trash */ if (failcnt++ >= MAXFAIL) { clear(); mvprintw(2, 10, "The alternate system clock has died!"); mvprintw(3, 10, "Reverting to ``pigs'' display."); move(CMDLINE, 0); refresh(); failcnt = 0; sleep(5); command("pigs"); } return; } failcnt = 0; etime /= hertz; etime /= ncpu; inttotal = 0; for (i = 0; i < nintr; i++) { if (s.intrcnt[i] == 0) continue; X(intrcnt); l = (int)((float)s.intrcnt[i]/etime + 0.5); inttotal += l; if (intrloc[i] == 0) { if (nextintsrow == LINES) continue; intrloc[i] = nextintsrow++; mvprintw(intrloc[i], INTSCOL + 6, "%-10.10s", intrname[i]); } putint(l, intrloc[i], INTSCOL, 5); } putint(inttotal, INTSROW + 1, INTSCOL, 5); Z(ncs_goodhits); Z(ncs_badhits); Z(ncs_miss); Z(ncs_long); Z(ncs_pass2); Z(ncs_2passes); Z(ncs_neghits); s.nchcount = nchtotal.ncs_goodhits + nchtotal.ncs_badhits + nchtotal.ncs_miss + nchtotal.ncs_long + nchtotal.ncs_neghits; if (state == TIME) s1.nchcount = s.nchcount; psiz = 0; f2 = 0.0; for (lc = 0; lc < CPUSTATES; lc++) { i = cpuorder[lc]; f1 = cputime(i); f2 += f1; l = (int) ((f2 + 1.0) / 2.0) - psiz; putfloat(f1, GRAPHROW, GRAPHCOL + 10 * lc, 4, 1, 0); move(GRAPHROW + 2, psiz); psiz += l; while (l-- > 0) addch(cpuchar[lc]); } putint(ucount(), STATROW, STATCOL, 5); putfloat(avenrun[0], STATROW, STATCOL + 20, 5, 2, 0); putfloat(avenrun[1], STATROW, STATCOL + 26, 5, 2, 0); putfloat(avenrun[2], STATROW, STATCOL + 32, 5, 2, 0); mvaddstr(STATROW, STATCOL + 55, buf); #define pgtokb(pg) ((pg) * (s.v_page_size / 1024)) putfloat(100.0 * (v_page_count - total.t_free) / v_page_count, STATROW + 1, STATCOL + 15, 2, 0, 1); putfloat(100.0 * s.v_kmem_map_size / kmem_size, STATROW + 1, STATCOL + 22, 2, 0, 1); putint(pgtokb(total.t_arm), MEMROW + 2, MEMCOL + 4, 7); putint(pgtokb(total.t_armshr), MEMROW + 2, MEMCOL + 12, 7); putint(pgtokb(total.t_avm), MEMROW + 2, MEMCOL + 20, 8); putint(pgtokb(total.t_avmshr), MEMROW + 2, MEMCOL + 29, 8); putint(pgtokb(total.t_rm), MEMROW + 3, MEMCOL + 4, 7); putint(pgtokb(total.t_rmshr), MEMROW + 3, MEMCOL + 12, 7); putint(pgtokb(total.t_vm), MEMROW + 3, MEMCOL + 20, 8); putint(pgtokb(total.t_vmshr), MEMROW + 3, MEMCOL + 29, 8); putint(pgtokb(total.t_free), MEMROW + 2, MEMCOL + 38, 7); putint(total.t_rq - 1, PROCSROW + 2, PROCSCOL, 3); putint(total.t_pw, PROCSROW + 2, PROCSCOL + 4, 3); putint(total.t_dw, PROCSROW + 2, PROCSCOL + 8, 3); putint(total.t_sl, PROCSROW + 2, PROCSCOL + 12, 3); putint(total.t_sw, PROCSROW + 2, PROCSCOL + 16, 3); PUTRATE(v_io_faults, VMSTATROW, VMSTATCOL + 2, 8 - 2); PUTRATE(v_cow_faults, VMSTATROW + 1, VMSTATCOL + 2, 8 - 2); PUTRATE(v_zfod, VMSTATROW + 2, VMSTATCOL + 2, 8 - 2); PUTRATE(v_ozfod, VMSTATROW + 3, VMSTATCOL, 8); putint(s.v_zfod != 0 ? (int)(s.v_ozfod * 100.0 / s.v_zfod) : 0, VMSTATROW + 4, VMSTATCOL + 1, 8 - 1); PUTRATE(v_dfree, VMSTATROW + 5, VMSTATCOL + 2, 8 - 2); PUTRATE(v_pfree, VMSTATROW + 6, VMSTATCOL + 2, 8 - 2); PUTRATE(v_tfree, VMSTATROW + 7, VMSTATCOL, 8); PUTRATE(v_reactivated, VMSTATROW + 8, VMSTATCOL, 8); PUTRATE(v_pdwakeups, VMSTATROW + 9, VMSTATCOL, 8); PUTRATE(v_pdpages, VMSTATROW + 10, VMSTATCOL, 8); PUTRATE(v_intrans, VMSTATROW + 11, VMSTATCOL, 8); putint(pgtokb(s.v_wire_count), VMSTATROW + 12, VMSTATCOL, 8); putint(pgtokb(s.v_active_count), VMSTATROW + 13, VMSTATCOL, 8); putint(pgtokb(s.v_inactive_count), VMSTATROW + 14, VMSTATCOL, 8); - putint(pgtokb(s.v_cache_count), VMSTATROW + 15, VMSTATCOL, 8); + putint(pgtokb(s.v_laundry_count), VMSTATROW + 15, VMSTATCOL, 8); putint(pgtokb(s.v_free_count), VMSTATROW + 16, VMSTATCOL, 8); if (LINES - 1 > VMSTATROW + 17) putint(s.bufspace / 1024, VMSTATROW + 17, VMSTATCOL, 8); PUTRATE(v_vnodein, PAGEROW + 2, PAGECOL + 6, 5); PUTRATE(v_vnodeout, PAGEROW + 2, PAGECOL + 12, 5); PUTRATE(v_swapin, PAGEROW + 2, PAGECOL + 19, 5); PUTRATE(v_swapout, PAGEROW + 2, PAGECOL + 25, 5); PUTRATE(v_vnodepgsin, PAGEROW + 3, PAGECOL + 6, 5); PUTRATE(v_vnodepgsout, PAGEROW + 3, PAGECOL + 12, 5); PUTRATE(v_swappgsin, PAGEROW + 3, PAGECOL + 19, 5); PUTRATE(v_swappgsout, PAGEROW + 3, PAGECOL + 25, 5); PUTRATE(v_swtch, GENSTATROW + 1, GENSTATCOL, 4); PUTRATE(v_trap, GENSTATROW + 1, GENSTATCOL + 5, 4); PUTRATE(v_syscall, GENSTATROW + 1, GENSTATCOL + 10, 4); PUTRATE(v_intr, GENSTATROW + 1, GENSTATCOL + 15, 4); PUTRATE(v_soft, GENSTATROW + 1, GENSTATCOL + 20, 4); PUTRATE(v_vm_faults, GENSTATROW + 1, GENSTATCOL + 25, 4); for (i = 0, lc = 0; i < num_devices && lc < MAXDRIVES; i++) if (dev_select[i].selected) { switch(state) { case TIME: dinfo(i, ++lc, &cur, &last); break; case RUN: dinfo(i, ++lc, &cur, &run); break; case BOOT: dinfo(i, ++lc, &cur, NULL); break; } } putint(s.numdirtybuffers, VNSTATROW, VNSTATCOL, 7); putint(s.desiredvnodes, VNSTATROW + 1, VNSTATCOL, 7); putint(s.numvnodes, VNSTATROW + 2, VNSTATCOL, 7); putint(s.freevnodes, VNSTATROW + 3, VNSTATCOL, 7); putint(s.nchcount, NAMEIROW + 2, NAMEICOL, 8); putint((nchtotal.ncs_goodhits + nchtotal.ncs_neghits), NAMEIROW + 2, NAMEICOL + 9, 7); #define nz(x) ((x) ? (x) : 1) putfloat((nchtotal.ncs_goodhits+nchtotal.ncs_neghits) * 100.0 / nz(s.nchcount), NAMEIROW + 2, NAMEICOL + 17, 3, 0, 1); putint(nchtotal.ncs_pass2, NAMEIROW + 2, NAMEICOL + 21, 7); putfloat(nchtotal.ncs_pass2 * 100.0 / nz(s.nchcount), NAMEIROW + 2, NAMEICOL + 29, 3, 0, 1); #undef nz } int cmdkre(const char *cmd, const char *args) { int retval; if (prefix(cmd, "run")) { retval = 1; copyinfo(&s2, &s1); switch (devstat_getdevs(NULL, &run)) { case -1: errx(1, "%s", devstat_errbuf); break; case 1: num_devices = run.dinfo->numdevs; generation = run.dinfo->generation; retval = dscmd("refresh", NULL, MAXDRIVES, &cur); if (retval == 2) labelkre(); break; default: break; } state = RUN; return (retval); } if (prefix(cmd, "boot")) { state = BOOT; copyinfo(&z, &s1); return (1); } if (prefix(cmd, "time")) { state = TIME; return (1); } if (prefix(cmd, "zero")) { retval = 1; if (state == RUN) { getinfo(&s1); switch (devstat_getdevs(NULL, &run)) { case -1: errx(1, "%s", devstat_errbuf); break; case 1: num_devices = run.dinfo->numdevs; generation = run.dinfo->generation; retval = dscmd("refresh",NULL, MAXDRIVES, &cur); if (retval == 2) labelkre(); break; default: break; } } return (retval); } retval = dscmd(cmd, args, MAXDRIVES, &cur); if (retval == 2) labelkre(); return(retval); } /* calculate number of users on the system */ static int ucount(void) { int nusers = 0; struct utmpx *ut; setutxent(); while ((ut = getutxent()) != NULL) if (ut->ut_type == USER_PROCESS) nusers++; endutxent(); return (nusers); } static float cputime(int indx) { double lt; int i; lt = 0; for (i = 0; i < CPUSTATES; i++) lt += s.time[i]; if (lt == 0.0) lt = 1.0; return (s.time[indx] * 100.0 / lt); } static void putint(int n, int l, int lc, int w) { int snr; char b[128]; move(l, lc); #ifdef DEBUG while (w-- > 0) addch('*'); return; #endif if (n == 0) { while (w-- > 0) addch(' '); return; } snr = snprintf(b, sizeof(b), "%*d", w, n); if (snr != w) snr = snprintf(b, sizeof(b), "%*dk", w - 1, n / 1000); if (snr != w) snr = snprintf(b, sizeof(b), "%*dM", w - 1, n / 1000000); if (snr != w) { while (w-- > 0) addch('*'); return; } addstr(b); } static void putfloat(double f, int l, int lc, int w, int d, int nz) { int snr; char b[128]; move(l, lc); #ifdef DEBUG while (--w >= 0) addch('*'); return; #endif if (nz && f == 0.0) { while (--w >= 0) addch(' '); return; } snr = snprintf(b, sizeof(b), "%*.*f", w, d, f); if (snr != w) snr = snprintf(b, sizeof(b), "%*.0f", w, f); if (snr != w) snr = snprintf(b, sizeof(b), "%*.0fk", w - 1, f / 1000); if (snr != w) snr = snprintf(b, sizeof(b), "%*.0fM", w - 1, f / 1000000); if (snr != w) { while (--w >= 0) addch('*'); return; } addstr(b); } static void putlongdouble(long double f, int l, int lc, int w, int d, int nz) { int snr; char b[128]; move(l, lc); #ifdef DEBUG while (--w >= 0) addch('*'); return; #endif if (nz && f == 0.0) { while (--w >= 0) addch(' '); return; } snr = snprintf(b, sizeof(b), "%*.*Lf", w, d, f); if (snr != w) snr = snprintf(b, sizeof(b), "%*.0Lf", w, f); if (snr != w) snr = snprintf(b, sizeof(b), "%*.0Lfk", w - 1, f / 1000); if (snr != w) snr = snprintf(b, sizeof(b), "%*.0LfM", w - 1, f / 1000000); if (snr != w) { while (--w >= 0) addch('*'); return; } addstr(b); } static void getinfo(struct Info *ls) { struct devinfo *tmp_dinfo; size_t size; int mib[2]; GETSYSCTL("kern.cp_time", ls->time); GETSYSCTL("kern.cp_time", cur.cp_time); GETSYSCTL("vm.stats.sys.v_swtch", ls->v_swtch); GETSYSCTL("vm.stats.sys.v_trap", ls->v_trap); GETSYSCTL("vm.stats.sys.v_syscall", ls->v_syscall); GETSYSCTL("vm.stats.sys.v_intr", ls->v_intr); GETSYSCTL("vm.stats.sys.v_soft", ls->v_soft); GETSYSCTL("vm.stats.vm.v_vm_faults", ls->v_vm_faults); GETSYSCTL("vm.stats.vm.v_io_faults", ls->v_io_faults); GETSYSCTL("vm.stats.vm.v_cow_faults", ls->v_cow_faults); GETSYSCTL("vm.stats.vm.v_zfod", ls->v_zfod); GETSYSCTL("vm.stats.vm.v_ozfod", ls->v_ozfod); GETSYSCTL("vm.stats.vm.v_swapin", ls->v_swapin); GETSYSCTL("vm.stats.vm.v_swapout", ls->v_swapout); GETSYSCTL("vm.stats.vm.v_swappgsin", ls->v_swappgsin); GETSYSCTL("vm.stats.vm.v_swappgsout", ls->v_swappgsout); GETSYSCTL("vm.stats.vm.v_vnodein", ls->v_vnodein); GETSYSCTL("vm.stats.vm.v_vnodeout", ls->v_vnodeout); GETSYSCTL("vm.stats.vm.v_vnodepgsin", ls->v_vnodepgsin); GETSYSCTL("vm.stats.vm.v_vnodepgsout", ls->v_vnodepgsout); GETSYSCTL("vm.stats.vm.v_intrans", ls->v_intrans); GETSYSCTL("vm.stats.vm.v_reactivated", ls->v_reactivated); GETSYSCTL("vm.stats.vm.v_pdwakeups", ls->v_pdwakeups); GETSYSCTL("vm.stats.vm.v_pdpages", ls->v_pdpages); GETSYSCTL("vm.stats.vm.v_dfree", ls->v_dfree); GETSYSCTL("vm.stats.vm.v_pfree", ls->v_pfree); GETSYSCTL("vm.stats.vm.v_tfree", ls->v_tfree); GETSYSCTL("vm.stats.vm.v_page_size", ls->v_page_size); GETSYSCTL("vm.stats.vm.v_free_count", ls->v_free_count); GETSYSCTL("vm.stats.vm.v_wire_count", ls->v_wire_count); GETSYSCTL("vm.stats.vm.v_active_count", ls->v_active_count); GETSYSCTL("vm.stats.vm.v_inactive_count", ls->v_inactive_count); - GETSYSCTL("vm.stats.vm.v_cache_count", ls->v_cache_count); + GETSYSCTL("vm.stats.vm.v_laundry_count", ls->v_laundry_count); GETSYSCTL("vfs.bufspace", ls->bufspace); GETSYSCTL("kern.maxvnodes", ls->desiredvnodes); GETSYSCTL("vfs.numvnodes", ls->numvnodes); GETSYSCTL("vfs.freevnodes", ls->freevnodes); GETSYSCTL("vfs.cache.nchstats", ls->nchstats); GETSYSCTL("vfs.numdirtybuffers", ls->numdirtybuffers); GETSYSCTL("vm.kmem_map_size", ls->v_kmem_map_size); getsysctl("hw.intrcnt", ls->intrcnt, nintr * sizeof(u_long)); size = sizeof(ls->Total); mib[0] = CTL_VM; mib[1] = VM_TOTAL; if (sysctl(mib, 2, &ls->Total, &size, NULL, 0) < 0) { error("Can't get kernel info: %s\n", strerror(errno)); bzero(&ls->Total, sizeof(ls->Total)); } size = sizeof(ncpu); if (sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0) < 0 || size != sizeof(ncpu)) ncpu = 1; tmp_dinfo = last.dinfo; last.dinfo = cur.dinfo; cur.dinfo = tmp_dinfo; last.snap_time = cur.snap_time; switch (devstat_getdevs(NULL, &cur)) { case -1: errx(1, "%s", devstat_errbuf); break; case 1: num_devices = cur.dinfo->numdevs; generation = cur.dinfo->generation; cmdkre("refresh", NULL); break; default: break; } } static void allocinfo(struct Info *ls) { ls->intrcnt = (long *) calloc(nintr, sizeof(long)); if (ls->intrcnt == NULL) errx(2, "out of memory"); } static void copyinfo(struct Info *from, struct Info *to) { long *intrcnt; /* * time, wds, seek, and xfer are malloc'd so we have to * save the pointers before the structure copy and then * copy by hand. */ intrcnt = to->intrcnt; *to = *from; bcopy(from->intrcnt, to->intrcnt = intrcnt, nintr * sizeof (int)); } static void dinfo(int dn, int lc, struct statinfo *now, struct statinfo *then) { long double transfers_per_second; long double kb_per_transfer, mb_per_second; long double elapsed_time, device_busy; int di; di = dev_select[dn].position; if (then != NULL) { /* Calculate relative to previous sample */ elapsed_time = now->snap_time - then->snap_time; } else { /* Calculate relative to device creation */ elapsed_time = now->snap_time - devstat_compute_etime( &now->dinfo->devices[di].creation_time, NULL); } if (devstat_compute_statistics(&now->dinfo->devices[di], then ? &then->dinfo->devices[di] : NULL, elapsed_time, DSM_KB_PER_TRANSFER, &kb_per_transfer, DSM_TRANSFERS_PER_SECOND, &transfers_per_second, DSM_MB_PER_SECOND, &mb_per_second, DSM_BUSY_PCT, &device_busy, DSM_NONE) != 0) errx(1, "%s", devstat_errbuf); lc = DISKCOL + lc * 6; putlongdouble(kb_per_transfer, DISKROW + 1, lc, 5, 2, 0); putlongdouble(transfers_per_second, DISKROW + 2, lc, 5, 0, 0); putlongdouble(mb_per_second, DISKROW + 3, lc, 5, 2, 0); putlongdouble(device_busy, DISKROW + 4, lc, 5, 0, 0); } Index: stable/11/usr.bin/top/machine.c =================================================================== --- stable/11/usr.bin/top/machine.c (revision 318715) +++ stable/11/usr.bin/top/machine.c (revision 318716) @@ -1,1680 +1,1680 @@ /* * top - a top users display for Unix * * SYNOPSIS: For FreeBSD-2.x and later * * DESCRIPTION: * Originally written for BSD4.4 system by Christos Zoulas. * Ported to FreeBSD 2.x by Steven Wallace && Wolfram Schneider * Order support hacked in from top-3.5beta6/machine/m_aix41.c * by Monte Mitzelfelt (for latest top see http://www.groupsys.com/topinfo/) * * This is the machine-dependent module for FreeBSD 2.2 * Works for: * FreeBSD 2.2.x, 3.x, 4.x, and probably FreeBSD 2.1.x * * LIBS: -lkvm * * AUTHOR: Christos Zoulas * Steven Wallace * Wolfram Schneider * Thomas Moestl * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "top.h" #include "machine.h" #include "screen.h" #include "utils.h" #include "layout.h" #define GETSYSCTL(name, var) getsysctl(name, &(var), sizeof(var)) #define SMPUNAMELEN 13 #define UPUNAMELEN 15 extern struct process_select ps; extern char* printable(char *); static int smpmode; enum displaymodes displaymode; #ifdef TOP_USERNAME_LEN static int namelength = TOP_USERNAME_LEN; #else static int namelength = 8; #endif /* TOP_JID_LEN based on max of 999999 */ #define TOP_JID_LEN 7 #define TOP_SWAP_LEN 6 static int jidlength; static int swaplength; static int cmdlengthdelta; /* Prototypes for top internals */ void quit(int); /* get_process_info passes back a handle. This is what it looks like: */ struct handle { struct kinfo_proc **next_proc; /* points to next valid proc pointer */ int remaining; /* number of pointers remaining */ }; /* declarations for load_avg */ #include "loadavg.h" /* define what weighted cpu is. */ #define weighted_cpu(pct, pp) ((pp)->ki_swtime == 0 ? 0.0 : \ ((pct) / (1.0 - exp((pp)->ki_swtime * logcpu)))) /* what we consider to be process size: */ #define PROCSIZE(pp) ((pp)->ki_size / 1024) #define RU(pp) (&(pp)->ki_rusage) #define RUTOT(pp) \ (RU(pp)->ru_inblock + RU(pp)->ru_oublock + RU(pp)->ru_majflt) #define PCTCPU(pp) (pcpu[pp - pbase]) /* definitions for indices in the nlist array */ /* * These definitions control the format of the per-process area */ static char io_header[] = " PID%*s %-*.*s VCSW IVCSW READ WRITE FAULT TOTAL PERCENT COMMAND"; #define io_Proc_format \ "%5d%*s %-*.*s %6ld %6ld %6ld %6ld %6ld %6ld %6.2f%% %.*s" static char smp_header_thr[] = " PID%*s %-*.*s THR PRI NICE SIZE RES%*s STATE C TIME %7s COMMAND"; static char smp_header[] = " PID%*s %-*.*s " "PRI NICE SIZE RES%*s STATE C TIME %7s COMMAND"; #define smp_Proc_format \ "%5d%*s %-*.*s %s%3d %4s%7s %6s%*.*s %-6.6s %2d%7s %6.2f%% %.*s" static char up_header_thr[] = " PID%*s %-*.*s THR PRI NICE SIZE RES%*s STATE TIME %7s COMMAND"; static char up_header[] = " PID%*s %-*.*s " "PRI NICE SIZE RES%*s STATE TIME %7s COMMAND"; #define up_Proc_format \ "%5d%*s %-*.*s %s%3d %4s%7s %6s%*.*s %-6.6s%.0d%7s %6.2f%% %.*s" /* process state names for the "STATE" column of the display */ /* the extra nulls in the string "run" are for adding a slash and the processor number when needed */ char *state_abbrev[] = { "", "START", "RUN\0\0\0", "SLEEP", "STOP", "ZOMB", "WAIT", "LOCK" }; static kvm_t *kd; /* values that we stash away in _init and use in later routines */ static double logcpu; /* these are retrieved from the kernel in _init */ static load_avg ccpu; /* these are used in the get_ functions */ static int lastpid; /* these are for calculating cpu state percentages */ static long cp_time[CPUSTATES]; static long cp_old[CPUSTATES]; static long cp_diff[CPUSTATES]; /* these are for detailing the process states */ int process_states[8]; char *procstatenames[] = { "", " starting, ", " running, ", " sleeping, ", " stopped, ", " zombie, ", " waiting, ", " lock, ", NULL }; /* these are for detailing the cpu states */ int cpu_states[CPUSTATES]; char *cpustatenames[] = { "user", "nice", "system", "interrupt", "idle", NULL }; /* these are for detailing the memory statistics */ int memory_stats[7]; char *memorynames[] = { - "K Active, ", "K Inact, ", "K Wired, ", "K Cache, ", "K Buf, ", + "K Active, ", "K Inact, ", "K Laundry, ", "K Wired, ", "K Buf, ", "K Free", NULL }; int arc_stats[7]; char *arcnames[] = { "K Total, ", "K MFU, ", "K MRU, ", "K Anon, ", "K Header, ", "K Other", NULL }; int carc_stats[5]; char *carcnames[] = { "K Compressed, ", "K Uncompressed, ", ":1 Ratio, ", "K Overhead", NULL }; int swap_stats[7]; char *swapnames[] = { "K Total, ", "K Used, ", "K Free, ", "% Inuse, ", "K In, ", "K Out", NULL }; /* these are for keeping track of the proc array */ static int nproc; static int onproc = -1; static int pref_len; static struct kinfo_proc *pbase; static struct kinfo_proc **pref; static struct kinfo_proc *previous_procs; static struct kinfo_proc **previous_pref; static int previous_proc_count = 0; static int previous_proc_count_max = 0; static int previous_thread; /* data used for recalculating pctcpu */ static double *pcpu; static struct timespec proc_uptime; static struct timeval proc_wall_time; static struct timeval previous_wall_time; static uint64_t previous_interval = 0; /* total number of io operations */ static long total_inblock; static long total_oublock; static long total_majflt; /* these are for getting the memory statistics */ static int arc_enabled; static int carc_enabled; static int pageshift; /* log base 2 of the pagesize */ /* define pagetok in terms of pageshift */ #define pagetok(size) ((size) << pageshift) /* swap usage */ #define ki_swap(kip) \ ((kip)->ki_swrss > (kip)->ki_rssize ? (kip)->ki_swrss - (kip)->ki_rssize : 0) /* useful externals */ long percentages(); #ifdef ORDER /* * Sorting orders. The first element is the default. */ char *ordernames[] = { "cpu", "size", "res", "time", "pri", "threads", "total", "read", "write", "fault", "vcsw", "ivcsw", "jid", "swap", "pid", NULL }; #endif /* Per-cpu time states */ static int maxcpu; static int maxid; static int ncpus; static u_long cpumask; static long *times; static long *pcpu_cp_time; static long *pcpu_cp_old; static long *pcpu_cp_diff; static int *pcpu_cpu_states; static int compare_swap(const void *a, const void *b); static int compare_jid(const void *a, const void *b); static int compare_pid(const void *a, const void *b); static int compare_tid(const void *a, const void *b); static const char *format_nice(const struct kinfo_proc *pp); static void getsysctl(const char *name, void *ptr, size_t len); static int swapmode(int *retavail, int *retfree); static void update_layout(void); void toggle_pcpustats(void) { if (ncpus == 1) return; update_layout(); } /* Adjust display based on ncpus and the ARC state. */ static void update_layout(void) { y_mem = 3; y_arc = 4; y_carc = 5; y_swap = 4 + arc_enabled + carc_enabled; y_idlecursor = 5 + arc_enabled + carc_enabled; y_message = 5 + arc_enabled + carc_enabled; y_header = 6 + arc_enabled + carc_enabled; y_procs = 7 + arc_enabled + carc_enabled; Header_lines = 7 + arc_enabled + carc_enabled; if (pcpu_stats) { y_mem += ncpus - 1; y_arc += ncpus - 1; y_carc += ncpus - 1; y_swap += ncpus - 1; y_idlecursor += ncpus - 1; y_message += ncpus - 1; y_header += ncpus - 1; y_procs += ncpus - 1; Header_lines += ncpus - 1; } } int machine_init(struct statics *statics, char do_unames) { int i, j, empty, pagesize; uint64_t arc_size; boolean_t carc_en; size_t size; struct passwd *pw; size = sizeof(smpmode); if ((sysctlbyname("machdep.smp_active", &smpmode, &size, NULL, 0) != 0 && sysctlbyname("kern.smp.active", &smpmode, &size, NULL, 0) != 0) || size != sizeof(smpmode)) smpmode = 0; size = sizeof(carc_en); if (sysctlbyname("vfs.zfs.compressed_arc_enabled", &carc_en, &size, NULL, 0) == 0 && carc_en == 1) carc_enabled = 1; size = sizeof(arc_size); if (sysctlbyname("kstat.zfs.misc.arcstats.size", &arc_size, &size, NULL, 0) == 0 && arc_size != 0) arc_enabled = 1; if (do_unames) { while ((pw = getpwent()) != NULL) { if (strlen(pw->pw_name) > namelength) namelength = strlen(pw->pw_name); } } if (smpmode && namelength > SMPUNAMELEN) namelength = SMPUNAMELEN; else if (namelength > UPUNAMELEN) namelength = UPUNAMELEN; kd = kvm_open(NULL, _PATH_DEVNULL, NULL, O_RDONLY, "kvm_open"); if (kd == NULL) return (-1); GETSYSCTL("kern.ccpu", ccpu); /* this is used in calculating WCPU -- calculate it ahead of time */ logcpu = log(loaddouble(ccpu)); pbase = NULL; pref = NULL; pcpu = NULL; nproc = 0; onproc = -1; /* get the page size and calculate pageshift from it */ pagesize = getpagesize(); pageshift = 0; while (pagesize > 1) { pageshift++; pagesize >>= 1; } /* we only need the amount of log(2)1024 for our conversion */ pageshift -= LOG1024; /* fill in the statics information */ statics->procstate_names = procstatenames; statics->cpustate_names = cpustatenames; statics->memory_names = memorynames; if (arc_enabled) statics->arc_names = arcnames; else statics->arc_names = NULL; if (carc_enabled) statics->carc_names = carcnames; else statics->carc_names = NULL; statics->swap_names = swapnames; #ifdef ORDER statics->order_names = ordernames; #endif /* Allocate state for per-CPU stats. */ cpumask = 0; ncpus = 0; GETSYSCTL("kern.smp.maxcpus", maxcpu); size = sizeof(long) * maxcpu * CPUSTATES; times = malloc(size); if (times == NULL) err(1, "malloc %zu bytes", size); if (sysctlbyname("kern.cp_times", times, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); pcpu_cp_time = calloc(1, size); maxid = (size / CPUSTATES / sizeof(long)) - 1; for (i = 0; i <= maxid; i++) { empty = 1; for (j = 0; empty && j < CPUSTATES; j++) { if (times[i * CPUSTATES + j] != 0) empty = 0; } if (!empty) { cpumask |= (1ul << i); ncpus++; } } size = sizeof(long) * ncpus * CPUSTATES; pcpu_cp_old = calloc(1, size); pcpu_cp_diff = calloc(1, size); pcpu_cpu_states = calloc(1, size); statics->ncpus = ncpus; update_layout(); /* all done! */ return (0); } char * format_header(char *uname_field) { static char Header[128]; const char *prehead; if (ps.jail) jidlength = TOP_JID_LEN + 1; /* +1 for extra left space. */ else jidlength = 0; if (ps.swap) swaplength = TOP_SWAP_LEN + 1; /* +1 for extra left space */ else swaplength = 0; switch (displaymode) { case DISP_CPU: /* * The logic of picking the right header format seems reverse * here because we only want to display a THR column when * "thread mode" is off (and threads are not listed as * separate lines). */ prehead = smpmode ? (ps.thread ? smp_header : smp_header_thr) : (ps.thread ? up_header : up_header_thr); snprintf(Header, sizeof(Header), prehead, jidlength, ps.jail ? " JID" : "", namelength, namelength, uname_field, swaplength, ps.swap ? " SWAP" : "", ps.wcpu ? "WCPU" : "CPU"); break; case DISP_IO: prehead = io_header; snprintf(Header, sizeof(Header), prehead, jidlength, ps.jail ? " JID" : "", namelength, namelength, uname_field); break; } cmdlengthdelta = strlen(Header) - 7; return (Header); } static int swappgsin = -1; static int swappgsout = -1; extern struct timeval timeout; void get_system_info(struct system_info *si) { long total; struct loadavg sysload; int mib[2]; struct timeval boottime; uint64_t arc_stat, arc_stat2; int i, j; size_t size; /* get the CPU stats */ size = (maxid + 1) * CPUSTATES * sizeof(long); if (sysctlbyname("kern.cp_times", pcpu_cp_time, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); GETSYSCTL("kern.cp_time", cp_time); GETSYSCTL("vm.loadavg", sysload); GETSYSCTL("kern.lastpid", lastpid); /* convert load averages to doubles */ for (i = 0; i < 3; i++) si->load_avg[i] = (double)sysload.ldavg[i] / sysload.fscale; /* convert cp_time counts to percentages */ for (i = j = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; percentages(CPUSTATES, &pcpu_cpu_states[j * CPUSTATES], &pcpu_cp_time[j * CPUSTATES], &pcpu_cp_old[j * CPUSTATES], &pcpu_cp_diff[j * CPUSTATES]); j++; } percentages(CPUSTATES, cpu_states, cp_time, cp_old, cp_diff); /* sum memory & swap statistics */ { static unsigned int swap_delay = 0; static int swapavail = 0; static int swapfree = 0; static long bufspace = 0; static int nspgsin, nspgsout; GETSYSCTL("vfs.bufspace", bufspace); GETSYSCTL("vm.stats.vm.v_active_count", memory_stats[0]); GETSYSCTL("vm.stats.vm.v_inactive_count", memory_stats[1]); - GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[2]); - GETSYSCTL("vm.stats.vm.v_cache_count", memory_stats[3]); + GETSYSCTL("vm.stats.vm.v_laundry_count", memory_stats[2]); + GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[3]); GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[5]); GETSYSCTL("vm.stats.vm.v_swappgsin", nspgsin); GETSYSCTL("vm.stats.vm.v_swappgsout", nspgsout); /* convert memory stats to Kbytes */ memory_stats[0] = pagetok(memory_stats[0]); memory_stats[1] = pagetok(memory_stats[1]); memory_stats[2] = pagetok(memory_stats[2]); memory_stats[3] = pagetok(memory_stats[3]); memory_stats[4] = bufspace / 1024; memory_stats[5] = pagetok(memory_stats[5]); memory_stats[6] = -1; /* first interval */ if (swappgsin < 0) { swap_stats[4] = 0; swap_stats[5] = 0; } /* compute differences between old and new swap statistic */ else { swap_stats[4] = pagetok(((nspgsin - swappgsin))); swap_stats[5] = pagetok(((nspgsout - swappgsout))); } swappgsin = nspgsin; swappgsout = nspgsout; /* call CPU heavy swapmode() only for changes */ if (swap_stats[4] > 0 || swap_stats[5] > 0 || swap_delay == 0) { swap_stats[3] = swapmode(&swapavail, &swapfree); swap_stats[0] = swapavail; swap_stats[1] = swapavail - swapfree; swap_stats[2] = swapfree; } swap_delay = 1; swap_stats[6] = -1; } if (arc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.size", arc_stat); arc_stats[0] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mfu_size", arc_stat); arc_stats[1] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mru_size", arc_stat); arc_stats[2] = arc_stat >> 10; GETSYSCTL("vfs.zfs.anon_size", arc_stat); arc_stats[3] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.hdr_size", arc_stat); GETSYSCTL("kstat.zfs.misc.arcstats.l2_hdr_size", arc_stat2); arc_stats[4] = arc_stat + arc_stat2 >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.other_size", arc_stat); arc_stats[5] = arc_stat >> 10; si->arc = arc_stats; } if (carc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.compressed_size", arc_stat); carc_stats[0] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.uncompressed_size", arc_stat); carc_stats[1] = arc_stat >> 10; carc_stats[2] = arc_stats[0]; /* ARC Total */ GETSYSCTL("kstat.zfs.misc.arcstats.overhead_size", arc_stat); carc_stats[3] = arc_stat >> 10; si->carc = carc_stats; } /* set arrays and strings */ if (pcpu_stats) { si->cpustates = pcpu_cpu_states; si->ncpus = ncpus; } else { si->cpustates = cpu_states; si->ncpus = 1; } si->memory = memory_stats; si->swap = swap_stats; if (lastpid > 0) { si->last_pid = lastpid; } else { si->last_pid = -1; } /* * Print how long system has been up. * (Found by looking getting "boottime" from the kernel) */ mib[0] = CTL_KERN; mib[1] = KERN_BOOTTIME; size = sizeof(boottime); if (sysctl(mib, nitems(mib), &boottime, &size, NULL, 0) != -1 && boottime.tv_sec != 0) { si->boottime = boottime; } else { si->boottime.tv_sec = -1; } } #define NOPROC ((void *)-1) /* * We need to compare data from the old process entry with the new * process entry. * To facilitate doing this quickly we stash a pointer in the kinfo_proc * structure to cache the mapping. We also use a negative cache pointer * of NOPROC to avoid duplicate lookups. * XXX: this could be done when the actual processes are fetched, we do * it here out of laziness. */ const struct kinfo_proc * get_old_proc(struct kinfo_proc *pp) { struct kinfo_proc **oldpp, *oldp; /* * If this is the first fetch of the kinfo_procs then we don't have * any previous entries. */ if (previous_proc_count == 0) return (NULL); /* negative cache? */ if (pp->ki_udata == NOPROC) return (NULL); /* cached? */ if (pp->ki_udata != NULL) return (pp->ki_udata); /* * Not cached, * 1) look up based on pid. * 2) compare process start. * If we fail here, then setup a negative cache entry, otherwise * cache it. */ oldpp = bsearch(&pp, previous_pref, previous_proc_count, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); if (oldpp == NULL) { pp->ki_udata = NOPROC; return (NULL); } oldp = *oldpp; if (bcmp(&oldp->ki_start, &pp->ki_start, sizeof(pp->ki_start)) != 0) { pp->ki_udata = NOPROC; return (NULL); } pp->ki_udata = oldp; return (oldp); } /* * Return the total amount of IO done in blocks in/out and faults. * store the values individually in the pointers passed in. */ long get_io_stats(struct kinfo_proc *pp, long *inp, long *oup, long *flp, long *vcsw, long *ivcsw) { const struct kinfo_proc *oldp; static struct kinfo_proc dummy; long ret; oldp = get_old_proc(pp); if (oldp == NULL) { bzero(&dummy, sizeof(dummy)); oldp = &dummy; } *inp = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; *oup = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; *flp = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; *vcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; *ivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; ret = (RU(pp)->ru_inblock - RU(oldp)->ru_inblock) + (RU(pp)->ru_oublock - RU(oldp)->ru_oublock) + (RU(pp)->ru_majflt - RU(oldp)->ru_majflt); return (ret); } /* * If there was a previous update, use the delta in ki_runtime over * the previous interval to calculate pctcpu. Otherwise, fall back * to using the kernel's ki_pctcpu. */ static double proc_calc_pctcpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; if (previous_interval != 0) { oldp = get_old_proc(pp); if (oldp != NULL) return ((double)(pp->ki_runtime - oldp->ki_runtime) / previous_interval); /* * If this process/thread was created during the previous * interval, charge it's total runtime to the previous * interval. */ else if (pp->ki_start.tv_sec > previous_wall_time.tv_sec || (pp->ki_start.tv_sec == previous_wall_time.tv_sec && pp->ki_start.tv_usec >= previous_wall_time.tv_usec)) return ((double)pp->ki_runtime / previous_interval); } return (pctdouble(pp->ki_pctcpu)); } /* * Return true if this process has used any CPU time since the * previous update. */ static int proc_used_cpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; oldp = get_old_proc(pp); if (oldp == NULL) return (PCTCPU(pp) != 0); return (pp->ki_runtime != oldp->ki_runtime || RU(pp)->ru_nvcsw != RU(oldp)->ru_nvcsw || RU(pp)->ru_nivcsw != RU(oldp)->ru_nivcsw); } /* * Return the total number of block in/out and faults by a process. */ long get_io_total(struct kinfo_proc *pp) { long dummy; return (get_io_stats(pp, &dummy, &dummy, &dummy, &dummy, &dummy)); } static struct handle handle; caddr_t get_process_info(struct system_info *si, struct process_select *sel, int (*compare)(const void *, const void *)) { int i; int total_procs; long p_io; long p_inblock, p_oublock, p_majflt, p_vcsw, p_ivcsw; long nsec; int active_procs; struct kinfo_proc **prefp; struct kinfo_proc *pp; struct timespec previous_proc_uptime; /* these are copied out of sel for speed */ int show_idle; int show_jid; int show_self; int show_system; int show_uid; int show_command; int show_kidle; /* * If thread state was toggled, don't cache the previous processes. */ if (previous_thread != sel->thread) nproc = 0; previous_thread = sel->thread; /* * Save the previous process info. */ if (previous_proc_count_max < nproc) { free(previous_procs); previous_procs = malloc(nproc * sizeof(*previous_procs)); free(previous_pref); previous_pref = malloc(nproc * sizeof(*previous_pref)); if (previous_procs == NULL || previous_pref == NULL) { (void) fprintf(stderr, "top: Out of memory.\n"); quit(23); } previous_proc_count_max = nproc; } if (nproc) { for (i = 0; i < nproc; i++) previous_pref[i] = &previous_procs[i]; bcopy(pbase, previous_procs, nproc * sizeof(*previous_procs)); qsort(previous_pref, nproc, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); } previous_proc_count = nproc; previous_proc_uptime = proc_uptime; previous_wall_time = proc_wall_time; previous_interval = 0; pbase = kvm_getprocs(kd, sel->thread ? KERN_PROC_ALL : KERN_PROC_PROC, 0, &nproc); (void)gettimeofday(&proc_wall_time, NULL); if (clock_gettime(CLOCK_UPTIME, &proc_uptime) != 0) memset(&proc_uptime, 0, sizeof(proc_uptime)); else if (previous_proc_uptime.tv_sec != 0 && previous_proc_uptime.tv_nsec != 0) { previous_interval = (proc_uptime.tv_sec - previous_proc_uptime.tv_sec) * 1000000; nsec = proc_uptime.tv_nsec - previous_proc_uptime.tv_nsec; if (nsec < 0) { previous_interval -= 1000000; nsec += 1000000000; } previous_interval += nsec / 1000; } if (nproc > onproc) { pref = realloc(pref, sizeof(*pref) * nproc); pcpu = realloc(pcpu, sizeof(*pcpu) * nproc); onproc = nproc; } if (pref == NULL || pbase == NULL || pcpu == NULL) { (void) fprintf(stderr, "top: Out of memory.\n"); quit(23); } /* get a pointer to the states summary array */ si->procstates = process_states; /* set up flags which define what we are going to select */ show_idle = sel->idle; show_jid = sel->jid != -1; show_self = sel->self == -1; show_system = sel->system; show_uid = sel->uid != -1; show_command = sel->command != NULL; show_kidle = sel->kidle; /* count up process states and get pointers to interesting procs */ total_procs = 0; active_procs = 0; total_inblock = 0; total_oublock = 0; total_majflt = 0; memset((char *)process_states, 0, sizeof(process_states)); prefp = pref; for (pp = pbase, i = 0; i < nproc; pp++, i++) { if (pp->ki_stat == 0) /* not in use */ continue; if (!show_self && pp->ki_pid == sel->self) /* skip self */ continue; if (!show_system && (pp->ki_flag & P_SYSTEM)) /* skip system process */ continue; p_io = get_io_stats(pp, &p_inblock, &p_oublock, &p_majflt, &p_vcsw, &p_ivcsw); total_inblock += p_inblock; total_oublock += p_oublock; total_majflt += p_majflt; total_procs++; process_states[pp->ki_stat]++; if (pp->ki_stat == SZOMB) /* skip zombies */ continue; if (!show_kidle && pp->ki_tdflags & TDF_IDLETD) /* skip kernel idle process */ continue; PCTCPU(pp) = proc_calc_pctcpu(pp); if (sel->thread && PCTCPU(pp) > 1.0) PCTCPU(pp) = 1.0; if (displaymode == DISP_CPU && !show_idle && (!proc_used_cpu(pp) || pp->ki_stat == SSTOP || pp->ki_stat == SIDL)) /* skip idle or non-running processes */ continue; if (displaymode == DISP_IO && !show_idle && p_io == 0) /* skip processes that aren't doing I/O */ continue; if (show_jid && pp->ki_jid != sel->jid) /* skip proc. that don't belong to the selected JID */ continue; if (show_uid && pp->ki_ruid != (uid_t)sel->uid) /* skip proc. that don't belong to the selected UID */ continue; *prefp++ = pp; active_procs++; } /* if requested, sort the "interesting" processes */ if (compare != NULL) qsort(pref, active_procs, sizeof(*pref), compare); /* remember active and total counts */ si->p_total = total_procs; si->p_active = pref_len = active_procs; /* pass back a handle */ handle.next_proc = pref; handle.remaining = active_procs; return ((caddr_t)&handle); } static char fmt[512]; /* static area where result is built */ char * format_next_process(caddr_t handle, char *(*get_userid)(int), int flags) { struct kinfo_proc *pp; const struct kinfo_proc *oldp; long cputime; double pct; struct handle *hp; char status[16]; int cpu, state; struct rusage ru, *rup; long p_tot, s_tot; char *proc_fmt, thr_buf[6]; char jid_buf[TOP_JID_LEN + 1], swap_buf[TOP_SWAP_LEN + 1]; char *cmdbuf = NULL; char **args; const int cmdlen = 128; /* find and remember the next proc structure */ hp = (struct handle *)handle; pp = *(hp->next_proc++); hp->remaining--; /* get the process's command name */ if ((pp->ki_flag & P_INMEM) == 0) { /* * Print swapped processes as */ size_t len; len = strlen(pp->ki_comm); if (len > sizeof(pp->ki_comm) - 3) len = sizeof(pp->ki_comm) - 3; memmove(pp->ki_comm + 1, pp->ki_comm, len); pp->ki_comm[0] = '<'; pp->ki_comm[len + 1] = '>'; pp->ki_comm[len + 2] = '\0'; } /* * Convert the process's runtime from microseconds to seconds. This * time includes the interrupt time although that is not wanted here. * ps(1) is similarly sloppy. */ cputime = (pp->ki_runtime + 500000) / 1000000; /* calculate the base for cpu percentages */ pct = PCTCPU(pp); /* generate "STATE" field */ switch (state = pp->ki_stat) { case SRUN: if (smpmode && pp->ki_oncpu != NOCPU) sprintf(status, "CPU%d", pp->ki_oncpu); else strcpy(status, "RUN"); break; case SLOCK: if (pp->ki_kiflag & KI_LOCKBLOCK) { sprintf(status, "*%.6s", pp->ki_lockname); break; } /* fall through */ case SSLEEP: if (pp->ki_wmesg != NULL) { sprintf(status, "%.6s", pp->ki_wmesg); break; } /* FALLTHROUGH */ default: if (state >= 0 && state < sizeof(state_abbrev) / sizeof(*state_abbrev)) sprintf(status, "%.6s", state_abbrev[state]); else sprintf(status, "?%5d", state); break; } cmdbuf = (char *)malloc(cmdlen + 1); if (cmdbuf == NULL) { warn("malloc(%d)", cmdlen + 1); return NULL; } if (!(flags & FMT_SHOWARGS)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, cmdlen, "%s{%s%s}", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, cmdlen, "%s", pp->ki_comm); } } else { if (pp->ki_flag & P_SYSTEM || pp->ki_args == NULL || (args = kvm_getargv(kd, pp, cmdlen)) == NULL || !(*args)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, cmdlen, "[%s{%s%s}]", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, cmdlen, "[%s]", pp->ki_comm); } } else { char *src, *dst, *argbuf; char *cmd; size_t argbuflen; size_t len; argbuflen = cmdlen * 4; argbuf = (char *)malloc(argbuflen + 1); if (argbuf == NULL) { warn("malloc(%zu)", argbuflen + 1); free(cmdbuf); return NULL; } dst = argbuf; /* Extract cmd name from argv */ cmd = strrchr(*args, '/'); if (cmd == NULL) cmd = *args; else cmd++; for (; (src = *args++) != NULL; ) { if (*src == '\0') continue; len = (argbuflen - (dst - argbuf) - 1) / 4; strvisx(dst, src, MIN(strlen(src), len), VIS_NL | VIS_CSTYLE); while (*dst != '\0') dst++; if ((argbuflen - (dst - argbuf) - 1) / 4 > 0) *dst++ = ' '; /* add delimiting space */ } if (dst != argbuf && dst[-1] == ' ') dst--; *dst = '\0'; if (strcmp(cmd, pp->ki_comm) != 0) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) snprintf(cmdbuf, cmdlen, "%s (%s){%s%s}", argbuf, pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); else snprintf(cmdbuf, cmdlen, "%s (%s)", argbuf, pp->ki_comm); } else { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) snprintf(cmdbuf, cmdlen, "%s{%s%s}", argbuf, pp->ki_tdname, pp->ki_moretdname); else strlcpy(cmdbuf, argbuf, cmdlen); } free(argbuf); } } if (ps.jail == 0) jid_buf[0] = '\0'; else snprintf(jid_buf, sizeof(jid_buf), "%*d", jidlength - 1, pp->ki_jid); if (ps.swap == 0) swap_buf[0] = '\0'; else snprintf(swap_buf, sizeof(swap_buf), "%*s", swaplength - 1, format_k2(pagetok(ki_swap(pp)))); /* XXX */ if (displaymode == DISP_IO) { oldp = get_old_proc(pp); if (oldp != NULL) { ru.ru_inblock = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; ru.ru_oublock = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; ru.ru_majflt = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; ru.ru_nvcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; ru.ru_nivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; rup = &ru; } else { rup = RU(pp); } p_tot = rup->ru_inblock + rup->ru_oublock + rup->ru_majflt; s_tot = total_inblock + total_oublock + total_majflt; snprintf(fmt, sizeof(fmt), io_Proc_format, pp->ki_pid, jidlength, jid_buf, namelength, namelength, (*get_userid)(pp->ki_ruid), rup->ru_nvcsw, rup->ru_nivcsw, rup->ru_inblock, rup->ru_oublock, rup->ru_majflt, p_tot, s_tot == 0 ? 0.0 : (p_tot * 100.0 / s_tot), screen_width > cmdlengthdelta ? screen_width - cmdlengthdelta : 0, printable(cmdbuf)); free(cmdbuf); return (fmt); } /* format this entry */ if (smpmode) { if (state == SRUN && pp->ki_oncpu != NOCPU) cpu = pp->ki_oncpu; else cpu = pp->ki_lastcpu; } else cpu = 0; proc_fmt = smpmode ? smp_Proc_format : up_Proc_format; if (ps.thread != 0) thr_buf[0] = '\0'; else snprintf(thr_buf, sizeof(thr_buf), "%*d ", (int)(sizeof(thr_buf) - 2), pp->ki_numthreads); snprintf(fmt, sizeof(fmt), proc_fmt, pp->ki_pid, jidlength, jid_buf, namelength, namelength, (*get_userid)(pp->ki_ruid), thr_buf, pp->ki_pri.pri_level - PZERO, format_nice(pp), format_k2(PROCSIZE(pp)), format_k2(pagetok(pp->ki_rssize)), swaplength, swaplength, swap_buf, status, cpu, format_time(cputime), ps.wcpu ? 100.0 * weighted_cpu(pct, pp) : 100.0 * pct, screen_width > cmdlengthdelta ? screen_width - cmdlengthdelta : 0, printable(cmdbuf)); free(cmdbuf); /* return the result */ return (fmt); } static void getsysctl(const char *name, void *ptr, size_t len) { size_t nlen = len; if (sysctlbyname(name, ptr, &nlen, NULL, 0) == -1) { fprintf(stderr, "top: sysctl(%s...) failed: %s\n", name, strerror(errno)); quit(23); } if (nlen != len) { fprintf(stderr, "top: sysctl(%s...) expected %lu, got %lu\n", name, (unsigned long)len, (unsigned long)nlen); quit(23); } } static const char * format_nice(const struct kinfo_proc *pp) { const char *fifo, *kproc; int rtpri; static char nicebuf[4 + 1]; fifo = PRI_NEED_RR(pp->ki_pri.pri_class) ? "" : "F"; kproc = (pp->ki_flag & P_KPROC) ? "k" : ""; switch (PRI_BASE(pp->ki_pri.pri_class)) { case PRI_ITHD: return ("-"); case PRI_REALTIME: /* * XXX: the kernel doesn't tell us the original rtprio and * doesn't really know what it was, so to recover it we * must be more chummy with the implementation than the * implementation is with itself. pri_user gives a * constant "base" priority, but is only initialized * properly for user threads. pri_native gives what the * kernel calls the "base" priority, but it isn't constant * since it is changed by priority propagation. pri_native * also isn't properly initialized for all threads, but it * is properly initialized for kernel realtime and idletime * threads. Thus we use pri_user for the base priority of * user threads (it is always correct) and pri_native for * the base priority of kernel realtime and idletime threads * (there is nothing better, and it is usually correct). * * The field width and thus the buffer are too small for * values like "kr31F", but such values shouldn't occur, * and if they do then the tailing "F" is not displayed. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_REALTIME; snprintf(nicebuf, sizeof(nicebuf), "%sr%d%s", kproc, rtpri, fifo); break; case PRI_TIMESHARE: if (pp->ki_flag & P_KPROC) return ("-"); snprintf(nicebuf, sizeof(nicebuf), "%d", pp->ki_nice - NZERO); break; case PRI_IDLE: /* XXX: as above. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_IDLE; snprintf(nicebuf, sizeof(nicebuf), "%si%d%s", kproc, rtpri, fifo); break; default: return ("?"); } return (nicebuf); } /* comparison routines for qsort */ static int compare_pid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; if ((*pp2)->ki_pid < 0 || (*pp1)->ki_pid < 0) abort(); return ((*pp1)->ki_pid - (*pp2)->ki_pid); } static int compare_tid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; if ((*pp2)->ki_tid < 0 || (*pp1)->ki_tid < 0) abort(); return ((*pp1)->ki_tid - (*pp2)->ki_tid); } /* * proc_compare - comparison function for "qsort" * Compares the resource consumption of two processes using five * distinct keys. The keys (in descending order of importance) are: * percent cpu, cpu ticks, state, resident set size, total virtual * memory usage. The process states are ordered as follows (from least * to most important): WAIT, zombie, sleep, stop, start, run. The * array declaration below maps a process state index into a number * that reflects this ordering. */ static int sorted_state[] = { 0, /* not used */ 3, /* sleep */ 1, /* ABANDONED (WAIT) */ 6, /* run */ 5, /* start */ 2, /* zombie */ 4 /* stop */ }; #define ORDERKEY_PCTCPU(a, b) do { \ double diff; \ if (ps.wcpu) \ diff = weighted_cpu(PCTCPU((b)), (b)) - \ weighted_cpu(PCTCPU((a)), (a)); \ else \ diff = PCTCPU((b)) - PCTCPU((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_CPTICKS(a, b) do { \ int64_t diff = (int64_t)(b)->ki_runtime - (int64_t)(a)->ki_runtime; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_STATE(a, b) do { \ int diff = sorted_state[(b)->ki_stat] - sorted_state[(a)->ki_stat]; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_PRIO(a, b) do { \ int diff = (int)(b)->ki_pri.pri_level - (int)(a)->ki_pri.pri_level; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_THREADS(a, b) do { \ int diff = (int)(b)->ki_numthreads - (int)(a)->ki_numthreads; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_RSSIZE(a, b) do { \ long diff = (long)(b)->ki_rssize - (long)(a)->ki_rssize; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_MEM(a, b) do { \ long diff = (long)PROCSIZE((b)) - (long)PROCSIZE((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_JID(a, b) do { \ int diff = (int)(b)->ki_jid - (int)(a)->ki_jid; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_SWAP(a, b) do { \ int diff = (int)ki_swap(b) - (int)ki_swap(a); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) /* compare_cpu - the comparison function for sorting by cpu percentage */ int #ifdef ORDER compare_cpu(void *arg1, void *arg2) #else proc_compare(void *arg1, void *arg2) #endif { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } #ifdef ORDER /* "cpu" compare routines */ int compare_size(), compare_res(), compare_time(), compare_prio(), compare_threads(); /* * "io" compare routines. Context switches aren't i/o, but are displayed * on the "io" display. */ int compare_iototal(), compare_ioread(), compare_iowrite(), compare_iofault(), compare_vcsw(), compare_ivcsw(); int (*compares[])() = { compare_cpu, compare_size, compare_res, compare_time, compare_prio, compare_threads, compare_iototal, compare_ioread, compare_iowrite, compare_iofault, compare_vcsw, compare_ivcsw, compare_jid, compare_swap, NULL }; /* compare_size - the comparison function for sorting by total memory usage */ int compare_size(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_MEM(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_res - the comparison function for sorting by resident set size */ int compare_res(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_time - the comparison function for sorting by total cpu time */ int compare_time(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_prio - the comparison function for sorting by priority */ int compare_prio(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_PRIO(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_threads - the comparison function for sorting by threads */ int compare_threads(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_THREADS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_jid - the comparison function for sorting by jid */ static int compare_jid(const void *arg1, const void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_JID(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_swap - the comparison function for sorting by swap */ static int compare_swap(const void *arg1, const void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; ORDERKEY_SWAP(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } #endif /* ORDER */ /* assorted comparison functions for sorting by i/o */ int #ifdef ORDER compare_iototal(void *arg1, void *arg2) #else io_compare(void *arg1, void *arg2) #endif { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; return (get_io_total(p2) - get_io_total(p1)); } #ifdef ORDER int compare_ioread(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, inp1, inp2; (void) get_io_stats(p1, &inp1, &dummy, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &inp2, &dummy, &dummy, &dummy, &dummy); return (inp2 - inp1); } int compare_iowrite(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, oup1, oup2; (void) get_io_stats(p1, &dummy, &oup1, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &oup2, &dummy, &dummy, &dummy); return (oup2 - oup1); } int compare_iofault(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &flp1, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &flp2, &dummy, &dummy); return (flp2 - flp1); } int compare_vcsw(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &flp1, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &flp2, &dummy); return (flp2 - flp1); } int compare_ivcsw(void *arg1, void *arg2) { struct kinfo_proc *p1 = *(struct kinfo_proc **)arg1; struct kinfo_proc *p2 = *(struct kinfo_proc **)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &dummy, &flp1); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &dummy, &flp2); return (flp2 - flp1); } #endif /* ORDER */ /* * proc_owner(pid) - returns the uid that owns process "pid", or -1 if * the process does not exist. * It is EXTREMELY IMPORTANT that this function work correctly. * If top runs setuid root (as in SVR4), then this function * is the only thing that stands in the way of a serious * security problem. It validates requests for the "kill" * and "renice" commands. */ int proc_owner(int pid) { int cnt; struct kinfo_proc **prefp; struct kinfo_proc *pp; prefp = pref; cnt = pref_len; while (--cnt >= 0) { pp = *prefp++; if (pp->ki_pid == (pid_t)pid) return ((int)pp->ki_ruid); } return (-1); } static int swapmode(int *retavail, int *retfree) { int n; int pagesize = getpagesize(); struct kvm_swap swapary[1]; *retavail = 0; *retfree = 0; #define CONVERT(v) ((quad_t)(v) * pagesize / 1024) n = kvm_getswapinfo(kd, swapary, 1, 0); if (n < 0 || swapary[0].ksw_total == 0) return (0); *retavail = CONVERT(swapary[0].ksw_total); *retfree = CONVERT(swapary[0].ksw_total - swapary[0].ksw_used); n = (int)(swapary[0].ksw_used * 100.0 / swapary[0].ksw_total); return (n); } Index: stable/11/usr.bin/top/top.local.1 =================================================================== --- stable/11/usr.bin/top/top.local.1 (revision 318715) +++ stable/11/usr.bin/top/top.local.1 (revision 318716) @@ -1,88 +1,87 @@ .\" $FreeBSD$ .SH "FreeBSD NOTES" .SH DESCRIPTION OF MEMORY -Mem: 61M Active, 86M Inact, 22G Wired, 1M Cache, 2M Buf, 102G Free +Mem: 61M Active, 86M Inact, 1M Laundry, 22G Wired, 2M Buf, 102G Free ARC: 15G Total, 9303M MFU, 6155M MRU, 1464K Anon, 98M Header, 35M Other 15G Compressed, 27G Uncompressed, 1.75:1 Ratio, 174M Overhead Swap: 4096M Total, 532M Free, 13% Inuse, 80K In, 104K Out .TP .B K: Kilobyte .TP .B M: Megabyte .TP .B G: Gigabyte .TP .B %: 1/100 .SS Physical Memory Stats .TP .B Active: number of bytes active .TP .B Inact: -number of bytes inactive +number of clean bytes inactive .TP +.B Laundry: +number of dirty bytes queued for laundering +.TP .B Wired: number of bytes wired down, including BIO-level cached file data pages -.TP -.B Cache: -number of clean bytes caching data that are available for -immediate reallocation .TP .B Buf: number of bytes used for BIO-level disk caching .TP .B Free: number of bytes free .SS ZFS ARC Stats These stats are only displayed when the ARC is in use. .TP .B Total: number of wired bytes used for the ZFS ARC .TP .B MRU: number of ARC bytes holding most recently used data .TP .B MFU: number of ARC bytes holding most frequently used data .TP .B Anon: number of ARC bytes holding in flight data .TP .B Header: number of ARC bytes holding headers .TP .B Other: miscellaneous ARC bytes .TP .B Compressed: bytes of memory used by ARC caches .TP .B Uncompressed: bytes of data stored in ARC caches before compression .TP .B Ratio: ratio of uncompressed data to total ARC size .TP .B Overhead: amount of overhead from ARC compression .SS Swap Stats .TP .B Total: total available swap usage .TP .B Free: total free swap usage .TP .B Inuse: swap usage .TP .B In: bytes paged in from swap devices (last interval) .TP .B Out: bytes paged out to swap devices (last interval) Index: stable/11/usr.bin/vmstat/vmstat.c =================================================================== --- stable/11/usr.bin/vmstat/vmstat.c (revision 318715) +++ stable/11/usr.bin/vmstat/vmstat.c (revision 318716) @@ -1,1725 +1,1725 @@ /* * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1986, 1991, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #if 0 #ifndef lint static char sccsid[] = "@(#)vmstat.c 8.1 (Berkeley) 6/6/93"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VMSTAT_XO_VERSION "1" static char da[] = "da"; static struct nlist namelist[] = { #define X_SUM 0 { "_vm_cnt" }, #define X_HZ 1 { "_hz" }, #define X_STATHZ 2 { "_stathz" }, #define X_NCHSTATS 3 { "_nchstats" }, #define X_INTRNAMES 4 { "_intrnames" }, #define X_SINTRNAMES 5 { "_sintrnames" }, #define X_INTRCNT 6 { "_intrcnt" }, #define X_SINTRCNT 7 { "_sintrcnt" }, #ifdef notyet #define X_DEFICIT XXX { "_deficit" }, #define X_REC XXX { "_rectime" }, #define X_PGIN XXX { "_pgintime" }, #define X_XSTATS XXX { "_xstats" }, #define X_END XXX #else #define X_END 8 #endif { "" }, }; static struct statinfo cur, last; static int num_devices, maxshowdevs; static long generation; static struct device_selection *dev_select; static int num_selected; static struct devstat_match *matches; static int num_matches = 0; static int num_devices_specified, num_selections; static long select_generation; static char **specified_devices; static devstat_select_mode select_mode; static struct vmmeter sum, osum; #define VMSTAT_DEFAULT_LINES 20 /* Default number of `winlines'. */ volatile sig_atomic_t wresized; /* Tty resized, when non-zero. */ static int winlines = VMSTAT_DEFAULT_LINES; /* Current number of tty rows. */ static int aflag; static int nflag; static int Pflag; static int hflag; static kvm_t *kd; #define FORKSTAT 0x01 #define INTRSTAT 0x02 #define MEMSTAT 0x04 #define SUMSTAT 0x08 #define TIMESTAT 0x10 #define VMSTAT 0x20 #define ZMEMSTAT 0x40 #define OBJSTAT 0x80 static void cpustats(void); static void pcpustats(int, u_long, int); static void devstats(void); static void doforkst(void); static void dointr(unsigned int, int); static void doobjstat(void); static void dosum(void); static void dovmstat(unsigned int, int); static void domemstat_malloc(void); static void domemstat_zone(void); static void kread(int, void *, size_t); static void kreado(int, void *, size_t, size_t); static char *kgetstr(const char *); static void needhdr(int); static void needresize(int); static void doresize(void); static void printhdr(int, u_long); static void usage(void); static long pct(long, long); static long long getuptime(void); static char **getdrivedata(char **); int main(int argc, char *argv[]) { int c, todo; unsigned int interval; float f; int reps; char *memf, *nlistf; char errbuf[_POSIX2_LINE_MAX]; memf = nlistf = NULL; interval = reps = todo = 0; maxshowdevs = 2; hflag = isatty(1); argc = xo_parse_args(argc, argv); if (argc < 0) return argc; while ((c = getopt(argc, argv, "ac:fhHiM:mN:n:oPp:stw:z")) != -1) { switch (c) { case 'a': aflag++; break; case 'c': reps = atoi(optarg); break; case 'P': Pflag++; break; case 'f': todo |= FORKSTAT; break; case 'h': hflag = 1; break; case 'H': hflag = 0; break; case 'i': todo |= INTRSTAT; break; case 'M': memf = optarg; break; case 'm': todo |= MEMSTAT; break; case 'N': nlistf = optarg; break; case 'n': nflag = 1; maxshowdevs = atoi(optarg); if (maxshowdevs < 0) xo_errx(1, "number of devices %d is < 0", maxshowdevs); break; case 'o': todo |= OBJSTAT; break; case 'p': if (devstat_buildmatch(optarg, &matches, &num_matches) != 0) xo_errx(1, "%s", devstat_errbuf); break; case 's': todo |= SUMSTAT; break; case 't': #ifdef notyet todo |= TIMESTAT; #else xo_errx(EX_USAGE, "sorry, -t is not (re)implemented yet"); #endif break; case 'w': /* Convert to milliseconds. */ f = atof(optarg); interval = f * 1000; break; case 'z': todo |= ZMEMSTAT; break; case '?': default: usage(); } } argc -= optind; argv += optind; xo_set_version(VMSTAT_XO_VERSION); if (todo == 0) todo = VMSTAT; if (memf != NULL) { kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf); if (kd == NULL) xo_errx(1, "kvm_openfiles: %s", errbuf); } retry_nlist: if (kd != NULL && (c = kvm_nlist(kd, namelist)) != 0) { if (c > 0) { int bufsize = 0, len = 0; char *buf, *bp; /* * 'cnt' was renamed to 'vm_cnt'. If 'vm_cnt' is not * found try looking up older 'cnt' symbol. * */ if (namelist[X_SUM].n_type == 0 && strcmp(namelist[X_SUM].n_name, "_vm_cnt") == 0) { namelist[X_SUM].n_name = "_cnt"; goto retry_nlist; } for (c = 0; c < (int)(nitems(namelist)); c++) if (namelist[c].n_type == 0) bufsize += strlen(namelist[c].n_name) + 1; bufsize += len + 1; buf = bp = alloca(bufsize); for (c = 0; c < (int)(nitems(namelist)); c++) if (namelist[c].n_type == 0) { xo_error(" %s", namelist[c].n_name); len = strlen(namelist[c].n_name); *bp++ = ' '; memcpy(bp, namelist[c].n_name, len); bp += len; } *bp = '\0'; xo_error("undefined symbols:\n", buf); } else xo_warnx("kvm_nlist: %s", kvm_geterr(kd)); xo_finish(); exit(1); } if (kd && Pflag) xo_errx(1, "Cannot use -P with crash dumps"); if (todo & VMSTAT) { /* * Make sure that the userland devstat version matches the * kernel devstat version. If not, exit and print a * message informing the user of his mistake. */ if (devstat_checkversion(NULL) < 0) xo_errx(1, "%s", devstat_errbuf); argv = getdrivedata(argv); } if (*argv) { f = atof(*argv); interval = f * 1000; if (*++argv) reps = atoi(*argv); } if (interval) { if (!reps) reps = -1; } else if (reps) interval = 1 * 1000; if (todo & FORKSTAT) doforkst(); if (todo & MEMSTAT) domemstat_malloc(); if (todo & ZMEMSTAT) domemstat_zone(); if (todo & SUMSTAT) dosum(); if (todo & OBJSTAT) doobjstat(); #ifdef notyet if (todo & TIMESTAT) dotimes(); #endif if (todo & INTRSTAT) dointr(interval, reps); if (todo & VMSTAT) dovmstat(interval, reps); xo_finish(); exit(0); } static int mysysctl(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int error; error = sysctlbyname(name, oldp, oldlenp, newp, newlen); if (error != 0 && errno != ENOMEM) xo_err(1, "sysctl(%s)", name); return (error); } static char ** getdrivedata(char **argv) { if ((num_devices = devstat_getnumdevs(NULL)) < 0) xo_errx(1, "%s", devstat_errbuf); cur.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo)); last.dinfo = (struct devinfo *)calloc(1, sizeof(struct devinfo)); if (devstat_getdevs(NULL, &cur) == -1) xo_errx(1, "%s", devstat_errbuf); num_devices = cur.dinfo->numdevs; generation = cur.dinfo->generation; specified_devices = (char **)malloc(sizeof(char *)); for (num_devices_specified = 0; *argv; ++argv) { if (isdigit(**argv)) break; num_devices_specified++; specified_devices = (char **)realloc(specified_devices, sizeof(char *) * num_devices_specified); specified_devices[num_devices_specified - 1] = *argv; } dev_select = NULL; if (nflag == 0 && maxshowdevs < num_devices_specified) maxshowdevs = num_devices_specified; /* * People are generally only interested in disk statistics when * they're running vmstat. So, that's what we're going to give * them if they don't specify anything by default. We'll also give * them any other random devices in the system so that we get to * maxshowdevs devices, if that many devices exist. If the user * specifies devices on the command line, either through a pattern * match or by naming them explicitly, we will give the user only * those devices. */ if ((num_devices_specified == 0) && (num_matches == 0)) { if (devstat_buildmatch(da, &matches, &num_matches) != 0) xo_errx(1, "%s", devstat_errbuf); select_mode = DS_SELECT_ADD; } else select_mode = DS_SELECT_ONLY; /* * At this point, selectdevs will almost surely indicate that the * device list has changed, so we don't look for return values of 0 * or 1. If we get back -1, though, there is an error. */ if (devstat_selectdevs(&dev_select, &num_selected, &num_selections, &select_generation, generation, cur.dinfo->devices, num_devices, matches, num_matches, specified_devices, num_devices_specified, select_mode, maxshowdevs, 0) == -1) xo_errx(1, "%s", devstat_errbuf); return(argv); } /* Return system uptime in nanoseconds */ static long long getuptime(void) { struct timespec sp; (void)clock_gettime(CLOCK_UPTIME, &sp); return((long long)sp.tv_sec * 1000000000LL + sp.tv_nsec); } static void fill_pcpu(struct pcpu ***pcpup, int* maxcpup) { struct pcpu **pcpu; int maxcpu, i; *pcpup = NULL; if (kd == NULL) return; maxcpu = kvm_getmaxcpu(kd); if (maxcpu < 0) xo_errx(1, "kvm_getmaxcpu: %s", kvm_geterr(kd)); pcpu = calloc(maxcpu, sizeof(struct pcpu *)); if (pcpu == NULL) xo_err(1, "calloc"); for (i = 0; i < maxcpu; i++) { pcpu[i] = kvm_getpcpu(kd, i); if (pcpu[i] == (struct pcpu *)-1) xo_errx(1, "kvm_getpcpu: %s", kvm_geterr(kd)); } *maxcpup = maxcpu; *pcpup = pcpu; } static void free_pcpu(struct pcpu **pcpu, int maxcpu) { int i; for (i = 0; i < maxcpu; i++) free(pcpu[i]); free(pcpu); } static void fill_vmmeter(struct vmmeter *vmmp) { struct pcpu **pcpu; int maxcpu, i; if (kd != NULL) { kread(X_SUM, vmmp, sizeof(*vmmp)); fill_pcpu(&pcpu, &maxcpu); for (i = 0; i < maxcpu; i++) { if (pcpu[i] == NULL) continue; #define ADD_FROM_PCPU(i, name) \ vmmp->name += pcpu[i]->pc_cnt.name ADD_FROM_PCPU(i, v_swtch); ADD_FROM_PCPU(i, v_trap); ADD_FROM_PCPU(i, v_syscall); ADD_FROM_PCPU(i, v_intr); ADD_FROM_PCPU(i, v_soft); ADD_FROM_PCPU(i, v_vm_faults); ADD_FROM_PCPU(i, v_io_faults); ADD_FROM_PCPU(i, v_cow_faults); ADD_FROM_PCPU(i, v_cow_optim); ADD_FROM_PCPU(i, v_zfod); ADD_FROM_PCPU(i, v_ozfod); ADD_FROM_PCPU(i, v_swapin); ADD_FROM_PCPU(i, v_swapout); ADD_FROM_PCPU(i, v_swappgsin); ADD_FROM_PCPU(i, v_swappgsout); ADD_FROM_PCPU(i, v_vnodein); ADD_FROM_PCPU(i, v_vnodeout); ADD_FROM_PCPU(i, v_vnodepgsin); ADD_FROM_PCPU(i, v_vnodepgsout); ADD_FROM_PCPU(i, v_intrans); ADD_FROM_PCPU(i, v_tfree); ADD_FROM_PCPU(i, v_forks); ADD_FROM_PCPU(i, v_vforks); ADD_FROM_PCPU(i, v_rforks); ADD_FROM_PCPU(i, v_kthreads); ADD_FROM_PCPU(i, v_forkpages); ADD_FROM_PCPU(i, v_vforkpages); ADD_FROM_PCPU(i, v_rforkpages); ADD_FROM_PCPU(i, v_kthreadpages); #undef ADD_FROM_PCPU } free_pcpu(pcpu, maxcpu); } else { size_t size = sizeof(unsigned int); #define GET_VM_STATS(cat, name) \ mysysctl("vm.stats." #cat "." #name, &vmmp->name, &size, NULL, 0) /* sys */ GET_VM_STATS(sys, v_swtch); GET_VM_STATS(sys, v_trap); GET_VM_STATS(sys, v_syscall); GET_VM_STATS(sys, v_intr); GET_VM_STATS(sys, v_soft); /* vm */ GET_VM_STATS(vm, v_vm_faults); GET_VM_STATS(vm, v_io_faults); GET_VM_STATS(vm, v_cow_faults); GET_VM_STATS(vm, v_cow_optim); GET_VM_STATS(vm, v_zfod); GET_VM_STATS(vm, v_ozfod); GET_VM_STATS(vm, v_swapin); GET_VM_STATS(vm, v_swapout); GET_VM_STATS(vm, v_swappgsin); GET_VM_STATS(vm, v_swappgsout); GET_VM_STATS(vm, v_vnodein); GET_VM_STATS(vm, v_vnodeout); GET_VM_STATS(vm, v_vnodepgsin); GET_VM_STATS(vm, v_vnodepgsout); GET_VM_STATS(vm, v_intrans); GET_VM_STATS(vm, v_reactivated); GET_VM_STATS(vm, v_pdwakeups); GET_VM_STATS(vm, v_pdpages); - GET_VM_STATS(vm, v_tcached); + GET_VM_STATS(vm, v_pdshortfalls); GET_VM_STATS(vm, v_dfree); GET_VM_STATS(vm, v_pfree); GET_VM_STATS(vm, v_tfree); GET_VM_STATS(vm, v_page_size); GET_VM_STATS(vm, v_page_count); GET_VM_STATS(vm, v_free_reserved); GET_VM_STATS(vm, v_free_target); GET_VM_STATS(vm, v_free_min); GET_VM_STATS(vm, v_free_count); GET_VM_STATS(vm, v_wire_count); GET_VM_STATS(vm, v_active_count); GET_VM_STATS(vm, v_inactive_target); GET_VM_STATS(vm, v_inactive_count); - GET_VM_STATS(vm, v_cache_count); + GET_VM_STATS(vm, v_laundry_count); GET_VM_STATS(vm, v_pageout_free_min); GET_VM_STATS(vm, v_interrupt_free_min); /*GET_VM_STATS(vm, v_free_severe);*/ GET_VM_STATS(vm, v_forks); GET_VM_STATS(vm, v_vforks); GET_VM_STATS(vm, v_rforks); GET_VM_STATS(vm, v_kthreads); GET_VM_STATS(vm, v_forkpages); GET_VM_STATS(vm, v_vforkpages); GET_VM_STATS(vm, v_rforkpages); GET_VM_STATS(vm, v_kthreadpages); #undef GET_VM_STATS } } static void fill_vmtotal(struct vmtotal *vmtp) { if (kd != NULL) { /* XXX fill vmtp */ xo_errx(1, "not implemented"); } else { size_t size = sizeof(*vmtp); mysysctl("vm.vmtotal", vmtp, &size, NULL, 0); if (size != sizeof(*vmtp)) xo_errx(1, "vm.total size mismatch"); } } /* Determine how many cpu columns, and what index they are in kern.cp_times */ static int getcpuinfo(u_long *maskp, int *maxidp) { int maxcpu; int maxid; int ncpus; int i, j; int empty; size_t size; long *times; u_long mask; if (kd != NULL) xo_errx(1, "not implemented"); mask = 0; ncpus = 0; size = sizeof(maxcpu); mysysctl("kern.smp.maxcpus", &maxcpu, &size, NULL, 0); if (size != sizeof(maxcpu)) xo_errx(1, "sysctl kern.smp.maxcpus"); size = sizeof(long) * maxcpu * CPUSTATES; times = malloc(size); if (times == NULL) xo_err(1, "malloc %zd bytes", size); mysysctl("kern.cp_times", times, &size, NULL, 0); maxid = (size / CPUSTATES / sizeof(long)) - 1; for (i = 0; i <= maxid; i++) { empty = 1; for (j = 0; empty && j < CPUSTATES; j++) { if (times[i * CPUSTATES + j] != 0) empty = 0; } if (!empty) { mask |= (1ul << i); ncpus++; } } if (maskp) *maskp = mask; if (maxidp) *maxidp = maxid; return (ncpus); } static void prthuman(const char *name, u_int64_t val, int size) { char buf[10]; int flags; char fmt[128]; snprintf(fmt, sizeof(fmt), "{:%s/%%*s}", name); if (size < 5 || size > 9) xo_errx(1, "doofus"); flags = HN_B | HN_NOSPACE | HN_DECIMAL; humanize_number(buf, size, val, "", HN_AUTOSCALE, flags); xo_attr("value", "%ju", (uintmax_t) val); xo_emit(fmt, size, buf); } static int hz, hdrcnt; static long *cur_cp_times; static long *last_cp_times; static size_t size_cp_times; static void dovmstat(unsigned int interval, int reps) { struct vmtotal total; time_t uptime, halfuptime; struct devinfo *tmp_dinfo; size_t size; int ncpus, maxid; u_long cpumask; int rate_adj; uptime = getuptime() / 1000000000LL; halfuptime = uptime / 2; rate_adj = 1; ncpus = 1; maxid = 0; /* * If the user stops the program (control-Z) and then resumes it, * print out the header again. */ (void)signal(SIGCONT, needhdr); /* * If our standard output is a tty, then install a SIGWINCH handler * and set wresized so that our first iteration through the main * vmstat loop will peek at the terminal's current rows to find out * how many lines can fit in a screenful of output. */ if (isatty(fileno(stdout)) != 0) { wresized = 1; (void)signal(SIGWINCH, needresize); } else { wresized = 0; winlines = VMSTAT_DEFAULT_LINES; } if (kd != NULL) { if (namelist[X_STATHZ].n_type != 0 && namelist[X_STATHZ].n_value != 0) kread(X_STATHZ, &hz, sizeof(hz)); if (!hz) kread(X_HZ, &hz, sizeof(hz)); } else { struct clockinfo clockrate; size = sizeof(clockrate); mysysctl("kern.clockrate", &clockrate, &size, NULL, 0); if (size != sizeof(clockrate)) xo_errx(1, "clockrate size mismatch"); hz = clockrate.hz; } if (Pflag) { ncpus = getcpuinfo(&cpumask, &maxid); size_cp_times = sizeof(long) * (maxid + 1) * CPUSTATES; cur_cp_times = calloc(1, size_cp_times); last_cp_times = calloc(1, size_cp_times); } for (hdrcnt = 1;;) { if (!--hdrcnt) printhdr(maxid, cpumask); if (kd != NULL) { if (kvm_getcptime(kd, cur.cp_time) < 0) xo_errx(1, "kvm_getcptime: %s", kvm_geterr(kd)); } else { size = sizeof(cur.cp_time); mysysctl("kern.cp_time", &cur.cp_time, &size, NULL, 0); if (size != sizeof(cur.cp_time)) xo_errx(1, "cp_time size mismatch"); } if (Pflag) { size = size_cp_times; mysysctl("kern.cp_times", cur_cp_times, &size, NULL, 0); if (size != size_cp_times) xo_errx(1, "cp_times mismatch"); } tmp_dinfo = last.dinfo; last.dinfo = cur.dinfo; cur.dinfo = tmp_dinfo; last.snap_time = cur.snap_time; /* * Here what we want to do is refresh our device stats. * getdevs() returns 1 when the device list has changed. * If the device list has changed, we want to go through * the selection process again, in case a device that we * were previously displaying has gone away. */ switch (devstat_getdevs(NULL, &cur)) { case -1: xo_errx(1, "%s", devstat_errbuf); break; case 1: { int retval; num_devices = cur.dinfo->numdevs; generation = cur.dinfo->generation; retval = devstat_selectdevs(&dev_select, &num_selected, &num_selections, &select_generation, generation, cur.dinfo->devices, num_devices, matches, num_matches, specified_devices, num_devices_specified, select_mode, maxshowdevs, 0); switch (retval) { case -1: xo_errx(1, "%s", devstat_errbuf); break; case 1: printhdr(maxid, cpumask); break; default: break; } } default: break; } fill_vmmeter(&sum); fill_vmtotal(&total); xo_open_container("processes"); xo_emit("{:runnable/%1d} {:waiting/%ld} " "{:swapped-out/%ld}", total.t_rq - 1, total.t_dw + total.t_pw, total.t_sw); xo_close_container("processes"); xo_open_container("memory"); #define vmstat_pgtok(a) ((a) * (sum.v_page_size >> 10)) #define rate(x) (((x) * rate_adj + halfuptime) / uptime) /* round */ if (hflag) { xo_emit(""); prthuman("available-memory", total.t_avm * (u_int64_t)sum.v_page_size, 5); xo_emit(" "); prthuman("free-memory", total.t_free * (u_int64_t)sum.v_page_size, 5); xo_emit(" "); } else { xo_emit(" "); xo_emit("{:available-memory/%7d}", vmstat_pgtok(total.t_avm)); xo_emit(" "); xo_emit("{:free-memory/%7d}", vmstat_pgtok(total.t_free)); } xo_emit("{:total-page-faults/%5lu} ", (unsigned long)rate(sum.v_vm_faults - osum.v_vm_faults)); xo_close_container("memory"); xo_open_container("paging-rates"); xo_emit("{:page-reactivated/%3lu} ", (unsigned long)rate(sum.v_reactivated - osum.v_reactivated)); xo_emit("{:paged-in/%3lu} ", (unsigned long)rate(sum.v_swapin + sum.v_vnodein - (osum.v_swapin + osum.v_vnodein))); xo_emit("{:paged-out/%3lu} ", (unsigned long)rate(sum.v_swapout + sum.v_vnodeout - (osum.v_swapout + osum.v_vnodeout))); xo_emit("{:freed/%5lu} ", (unsigned long)rate(sum.v_tfree - osum.v_tfree)); xo_emit("{:scanned/%4lu} ", (unsigned long)rate(sum.v_pdpages - osum.v_pdpages)); xo_close_container("paging-rates"); devstats(); xo_open_container("fault-rates"); xo_emit("{:interrupts/%4lu} {:system-calls/%5lu} " "{:context-switches/%5u}", (unsigned long)rate(sum.v_intr - osum.v_intr), (unsigned long)rate(sum.v_syscall - osum.v_syscall), (unsigned long)rate(sum.v_swtch - osum.v_swtch)); xo_close_container("fault-rates"); if (Pflag) pcpustats(ncpus, cpumask, maxid); else cpustats(); xo_emit("\n"); xo_flush(); if (reps >= 0 && --reps <= 0) break; osum = sum; uptime = interval; rate_adj = 1000; /* * We round upward to avoid losing low-frequency events * (i.e., >= 1 per interval but < 1 per millisecond). */ if (interval != 1) halfuptime = (uptime + 1) / 2; else halfuptime = 0; (void)usleep(interval * 1000); } } static void printhdr(int maxid, u_long cpumask) { int i, num_shown; num_shown = MIN(num_selected, maxshowdevs); if (hflag) { xo_emit("{T:procs} {T:memory} {T:/page%*s}", 19, ""); } else { xo_emit("{T:procs} {T:memory} {T:/page%*s}", 19, ""); } if (num_shown > 1) xo_emit(" {T:/disks %*s}", num_shown * 4 - 7, ""); else if (num_shown == 1) xo_emit(" {T:disks}"); xo_emit(" {T:faults} "); if (Pflag) { for (i = 0; i <= maxid; i++) { if (cpumask & (1ul << i)) xo_emit(" {T:/cpu%d} ", i); } xo_emit("\n"); } else xo_emit(" {T:cpu}\n"); if (hflag) { xo_emit("{T:r} {T:b} {T:w} {T:avm} {T:fre} {T:flt} {T:re} {T:pi} {T:po} {T:fr} {T:sr} "); } else { xo_emit("{T:r} {T:b} {T:w} {T:avm} {T:fre} {T:flt} {T:re} {T:pi} {T:po} {T:fr} {T:sr} "); } for (i = 0; i < num_devices; i++) if ((dev_select[i].selected) && (dev_select[i].selected <= maxshowdevs)) xo_emit("{T:/%c%c%d} ", dev_select[i].device_name[0], dev_select[i].device_name[1], dev_select[i].unit_number); xo_emit(" {T:in} {T:sy} {T:cs}"); if (Pflag) { for (i = 0; i <= maxid; i++) { if (cpumask & (1ul << i)) xo_emit(" {T:us} {T:sy} {T:id}"); } xo_emit("\n"); } else xo_emit(" {T:us} {T:sy} {T:id}\n"); if (wresized != 0) doresize(); hdrcnt = winlines; } /* * Force a header to be prepended to the next output. */ static void needhdr(int dummy __unused) { hdrcnt = 1; } /* * When the terminal is resized, force an update of the maximum number of rows * printed between each header repetition. Then force a new header to be * prepended to the next output. */ void needresize(int signo) { wresized = 1; hdrcnt = 1; } /* * Update the global `winlines' count of terminal rows. */ void doresize(void) { int status; struct winsize w; for (;;) { status = ioctl(fileno(stdout), TIOCGWINSZ, &w); if (status == -1 && errno == EINTR) continue; else if (status == -1) xo_err(1, "ioctl"); if (w.ws_row > 3) winlines = w.ws_row - 3; else winlines = VMSTAT_DEFAULT_LINES; break; } /* * Inhibit doresize() calls until we are rescheduled by SIGWINCH. */ wresized = 0; } #ifdef notyet static void dotimes(void) { unsigned int pgintime, rectime; kread(X_REC, &rectime, sizeof(rectime)); kread(X_PGIN, &pgintime, sizeof(pgintime)); kread(X_SUM, &sum, sizeof(sum)); xo_emit("{:page-reclaims/%u} {N:reclaims}, " "{:reclaim-time/%u} {N:total time (usec)}\n", sum.v_pgrec, rectime); xo_emit("{L:average}: {:reclaim-average/%u} {N:usec \\/ reclaim}\n", rectime / sum.v_pgrec); xo_emit("\n"); xo_emit("{:page-ins/%u} {N:page ins}, " "{:page-in-time/%u} {N:total time (msec)}\n", sum.v_pgin, pgintime / 10); xo_emit("{L:average}: {:average/%8.1f} {N:msec \\/ page in}\n", pgintime / (sum.v_pgin * 10.0)); } #endif static long pct(long top, long bot) { long ans; if (bot == 0) return(0); ans = (quad_t)top * 100 / bot; return (ans); } #define PCT(top, bot) pct((long)(top), (long)(bot)) static void dosum(void) { struct nchstats lnchstats; long nchtotal; fill_vmmeter(&sum); xo_open_container("summary-statistics"); xo_emit("{:context-switches/%9u} {N:cpu context switches}\n", sum.v_swtch); xo_emit("{:interrupts/%9u} {N:device interrupts}\n", sum.v_intr); xo_emit("{:software-interrupts/%9u} {N:software interrupts}\n", sum.v_soft); xo_emit("{:traps/%9u} {N:traps}\n", sum.v_trap); xo_emit("{:system-calls/%9u} {N:system calls}\n", sum.v_syscall); xo_emit("{:kernel-threads/%9u} {N:kernel threads created}\n", sum.v_kthreads); xo_emit("{:forks/%9u} {N: fork() calls}\n", sum.v_forks); xo_emit("{:vforks/%9u} {N:vfork() calls}\n", sum.v_vforks); xo_emit("{:rforks/%9u} {N:rfork() calls}\n", sum.v_rforks); xo_emit("{:swap-ins/%9u} {N:swap pager pageins}\n", sum.v_swapin); xo_emit("{:swap-in-pages/%9u} {N:swap pager pages paged in}\n", sum.v_swappgsin); xo_emit("{:swap-outs/%9u} {N:swap pager pageouts}\n", sum.v_swapout); xo_emit("{:swap-out-pages/%9u} {N:swap pager pages paged out}\n", sum.v_swappgsout); xo_emit("{:vnode-page-ins/%9u} {N:vnode pager pageins}\n", sum.v_vnodein); xo_emit("{:vnode-page-in-pages/%9u} {N:vnode pager pages paged in}\n", sum.v_vnodepgsin); xo_emit("{:vnode-page-outs/%9u} {N:vnode pager pageouts}\n", sum.v_vnodeout); xo_emit("{:vnode-page-outs/%9u} {N:vnode pager pages paged out}\n", sum.v_vnodepgsout); xo_emit("{:page-daemon-wakeups/%9u} {N:page daemon wakeups}\n", sum.v_pdwakeups); xo_emit("{:page-daemon-pages/%9u} {N:pages examined by the page daemon}\n", sum.v_pdpages); - xo_emit("{:reactivated/%9u} {N:pages reactivated}\n", + xo_emit("{:page-reclamation-shortfalls/%9u} {N:clean page reclamation shortfalls}\n", + sum.v_pdshortfalls); + xo_emit("{:reactivated/%9u} {N:pages reactivated by the page daemon}\n", sum.v_reactivated); xo_emit("{:copy-on-write-faults/%9u} {N:copy-on-write faults}\n", sum.v_cow_faults); xo_emit("{:copy-on-write-optimized-faults/%9u} {N:copy-on-write optimized faults}\n", sum.v_cow_optim); xo_emit("{:zero-fill-pages/%9u} {N:zero fill pages zeroed}\n", sum.v_zfod); xo_emit("{:zero-fill-prezeroed/%9u} {N:zero fill pages prezeroed}\n", sum.v_ozfod); xo_emit("{:intransit-blocking/%9u} {N:intransit blocking page faults}\n", sum.v_intrans); xo_emit("{:total-faults/%9u} {N:total VM faults taken}\n", sum.v_vm_faults); xo_emit("{:faults-requiring-io/%9u} {N:page faults requiring I\\/O}\n", sum.v_io_faults); xo_emit("{:faults-from-thread-creation/%9u} {N:pages affected by kernel thread creation}\n", sum.v_kthreadpages); xo_emit("{:faults-from-fork/%9u} {N:pages affected by fork}()\n", sum.v_forkpages); xo_emit("{:faults-from-vfork/%9u} {N:pages affected by vfork}()\n", sum.v_vforkpages); xo_emit("{:pages-rfork/%9u} {N:pages affected by rfork}()\n", sum.v_rforkpages); - xo_emit("{:pages-total-cached/%9u} {N:pages cached}\n", - sum.v_tcached); xo_emit("{:pages-freed/%9u} {N:pages freed}\n", sum.v_tfree); xo_emit("{:pages-freed-by-daemon/%9u} {N:pages freed by daemon}\n", sum.v_dfree); xo_emit("{:pages-freed-on-exit/%9u} {N:pages freed by exiting processes}\n", sum.v_pfree); xo_emit("{:active-pages/%9u} {N:pages active}\n", sum.v_active_count); xo_emit("{:inactive-pages/%9u} {N:pages inactive}\n", sum.v_inactive_count); - xo_emit("{:vm-cache/%9u} {N:pages in VM cache}\n", - sum.v_cache_count); + xo_emit("{:laundry-pages/%9u} {N:pages in the laundry queue}\n", + sum.v_laundry_count); xo_emit("{:wired-pages/%9u} {N:pages wired down}\n", sum.v_wire_count); xo_emit("{:free-pages/%9u} {N:pages free}\n", sum.v_free_count); xo_emit("{:bytes-per-page/%9u} {N:bytes per page}\n", sum.v_page_size); if (kd != NULL) { kread(X_NCHSTATS, &lnchstats, sizeof(lnchstats)); } else { size_t size = sizeof(lnchstats); mysysctl("vfs.cache.nchstats", &lnchstats, &size, NULL, 0); if (size != sizeof(lnchstats)) xo_errx(1, "vfs.cache.nchstats size mismatch"); } nchtotal = lnchstats.ncs_goodhits + lnchstats.ncs_neghits + lnchstats.ncs_badhits + lnchstats.ncs_falsehits + lnchstats.ncs_miss + lnchstats.ncs_long; xo_emit("{:total-name-lookups/%9ld} {N:total name lookups}\n", nchtotal); xo_emit("{P:/%9s} {N:cache hits} " "({:positive-cache-hits/%ld}% pos + " "{:negative-cache-hits/%ld}% {N:neg}) " "system {:cache-hit-percent/%ld}% per-directory\n", "", PCT(lnchstats.ncs_goodhits, nchtotal), PCT(lnchstats.ncs_neghits, nchtotal), PCT(lnchstats.ncs_pass2, nchtotal)); xo_emit("{P:/%9s} {L:deletions} {:deletions/%ld}%, " "{L:falsehits} {:false-hits/%ld}%, " "{L:toolong} {:too-long/%ld}%\n", "", PCT(lnchstats.ncs_badhits, nchtotal), PCT(lnchstats.ncs_falsehits, nchtotal), PCT(lnchstats.ncs_long, nchtotal)); xo_close_container("summary-statistics"); } static void doforkst(void) { fill_vmmeter(&sum); xo_open_container("fork-statistics"); xo_emit("{:fork/%u} {N:forks}, {:fork-pages/%u} {N:pages}, " "{L:average} {:fork-average/%.2f}\n", sum.v_forks, sum.v_forkpages, sum.v_forks == 0 ? 0.0 : (double)sum.v_forkpages / sum.v_forks); xo_emit("{:vfork/%u} {N:vforks}, {:vfork-pages/%u} {N:pages}, " "{L:average} {:vfork-average/%.2f}\n", sum.v_vforks, sum.v_vforkpages, sum.v_vforks == 0 ? 0.0 : (double)sum.v_vforkpages / sum.v_vforks); xo_emit("{:rfork/%u} {N:rforks}, {:rfork-pages/%u} {N:pages}, " "{L:average} {:rfork-average/%.2f}\n", sum.v_rforks, sum.v_rforkpages, sum.v_rforks == 0 ? 0.0 : (double)sum.v_rforkpages / sum.v_rforks); xo_close_container("fork-statistics"); } static void devstats(void) { int dn, state; long double transfers_per_second; long double busy_seconds; long tmp; for (state = 0; state < CPUSTATES; ++state) { tmp = cur.cp_time[state]; cur.cp_time[state] -= last.cp_time[state]; last.cp_time[state] = tmp; } busy_seconds = cur.snap_time - last.snap_time; xo_open_list("device"); for (dn = 0; dn < num_devices; dn++) { int di; if ((dev_select[dn].selected == 0) || (dev_select[dn].selected > maxshowdevs)) continue; di = dev_select[dn].position; if (devstat_compute_statistics(&cur.dinfo->devices[di], &last.dinfo->devices[di], busy_seconds, DSM_TRANSFERS_PER_SECOND, &transfers_per_second, DSM_NONE) != 0) xo_errx(1, "%s", devstat_errbuf); xo_open_instance("device"); xo_emit("{ekq:name/%c%c%d}{:transfers/%3.0Lf} ", dev_select[dn].device_name[0], dev_select[dn].device_name[1], dev_select[dn].unit_number, transfers_per_second); xo_close_instance("device"); } xo_close_list("device"); } static void percent(const char *name, double pct, int *over) { char buf[10]; char fmt[128]; int l; snprintf(fmt, sizeof(fmt), " {:%s/%%*s}", name); l = snprintf(buf, sizeof(buf), "%.0f", pct); if (l == 1 && *over) { xo_emit(fmt, 1, buf); (*over)--; } else xo_emit(fmt, 2, buf); if (l > 2) (*over)++; } static void cpustats(void) { int state, over; double lpct, total; total = 0; for (state = 0; state < CPUSTATES; ++state) total += cur.cp_time[state]; if (total) lpct = 100.0 / total; else lpct = 0.0; over = 0; xo_open_container("cpu-statistics"); percent("user", (cur.cp_time[CP_USER] + cur.cp_time[CP_NICE]) * lpct, &over); percent("system", (cur.cp_time[CP_SYS] + cur.cp_time[CP_INTR]) * lpct, &over); percent("idle", cur.cp_time[CP_IDLE] * lpct, &over); xo_close_container("cpu-statistics"); } static void pcpustats(int ncpus, u_long cpumask, int maxid) { int state, i; double lpct, total; long tmp; int over; /* devstats does this for cp_time */ for (i = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; for (state = 0; state < CPUSTATES; ++state) { tmp = cur_cp_times[i * CPUSTATES + state]; cur_cp_times[i * CPUSTATES + state] -= last_cp_times[i * CPUSTATES + state]; last_cp_times[i * CPUSTATES + state] = tmp; } } over = 0; xo_open_list("cpu"); for (i = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; xo_open_instance("cpu"); xo_emit("{ke:name/%d}", i); total = 0; for (state = 0; state < CPUSTATES; ++state) total += cur_cp_times[i * CPUSTATES + state]; if (total) lpct = 100.0 / total; else lpct = 0.0; percent("user", (cur_cp_times[i * CPUSTATES + CP_USER] + cur_cp_times[i * CPUSTATES + CP_NICE]) * lpct, &over); percent("system", (cur_cp_times[i * CPUSTATES + CP_SYS] + cur_cp_times[i * CPUSTATES + CP_INTR]) * lpct, &over); percent("idle", cur_cp_times[i * CPUSTATES + CP_IDLE] * lpct, &over); xo_close_instance("cpu"); } xo_close_list("cpu"); } static unsigned int read_intrcnts(unsigned long **intrcnts) { size_t intrcntlen; if (kd != NULL) { kread(X_SINTRCNT, &intrcntlen, sizeof(intrcntlen)); if ((*intrcnts = malloc(intrcntlen)) == NULL) err(1, "malloc()"); kread(X_INTRCNT, *intrcnts, intrcntlen); } else { for (*intrcnts = NULL, intrcntlen = 1024; ; intrcntlen *= 2) { *intrcnts = reallocf(*intrcnts, intrcntlen); if (*intrcnts == NULL) err(1, "reallocf()"); if (mysysctl("hw.intrcnt", *intrcnts, &intrcntlen, NULL, 0) == 0) break; } } return (intrcntlen / sizeof(unsigned long)); } static void print_intrcnts(unsigned long *intrcnts, unsigned long *old_intrcnts, char *intrnames, unsigned int nintr, size_t istrnamlen, long long period_ms) { unsigned long *intrcnt, *old_intrcnt; uint64_t inttotal, old_inttotal, total_count, total_rate; char* intrname; unsigned int i; inttotal = 0; old_inttotal = 0; intrname = intrnames; xo_open_list("interrupt"); for (i = 0, intrcnt=intrcnts, old_intrcnt=old_intrcnts; i < nintr; i++) { if (intrname[0] != '\0' && (*intrcnt != 0 || aflag)) { unsigned long count, rate; count = *intrcnt - *old_intrcnt; rate = (count * 1000 + period_ms / 2) / period_ms; xo_open_instance("interrupt"); xo_emit("{d:name/%-*s}{ket:name/%s} " "{:total/%20lu} {:rate/%10lu}\n", (int)istrnamlen, intrname, intrname, count, rate); xo_close_instance("interrupt"); } intrname += strlen(intrname) + 1; inttotal += *intrcnt++; old_inttotal += *old_intrcnt++; } total_count = inttotal - old_inttotal; total_rate = (total_count * 1000 + period_ms / 2) / period_ms; xo_close_list("interrupt"); xo_emit("{L:/%-*s} {:total-interrupts/%20" PRIu64 "} " "{:total-rate/%10" PRIu64 "}\n", (int)istrnamlen, "Total", total_count, total_rate); } static void dointr(unsigned int interval, int reps) { unsigned long *intrcnts; long long uptime, period_ms; unsigned long *old_intrcnts = NULL; size_t clen, inamlen, istrnamlen; char *intrnames, *intrname; uptime = getuptime(); /* Get the names of each interrupt source */ if (kd != NULL) { kread(X_SINTRNAMES, &inamlen, sizeof(inamlen)); if ((intrnames = malloc(inamlen)) == NULL) xo_err(1, "malloc()"); kread(X_INTRNAMES, intrnames, inamlen); } else { for (intrnames = NULL, inamlen = 1024; ; inamlen *= 2) { if ((intrnames = reallocf(intrnames, inamlen)) == NULL) xo_err(1, "reallocf()"); if (mysysctl("hw.intrnames", intrnames, &inamlen, NULL, 0) == 0) break; } } /* Determine the length of the longest interrupt name */ intrname = intrnames; istrnamlen = strlen("interrupt"); while(*intrname != '\0') { clen = strlen(intrname); if (clen > istrnamlen) istrnamlen = clen; intrname += strlen(intrname) + 1; } xo_emit("{T:/%-*s} {T:/%20s} {T:/%10s}\n", (int)istrnamlen, "interrupt", "total", "rate"); /* * Loop reps times printing differential interrupt counts. If reps is * zero, then run just once, printing total counts */ xo_open_container("interrupt-statistics"); period_ms = uptime / 1000000; while(1) { unsigned int nintr; long long old_uptime; nintr = read_intrcnts(&intrcnts); /* * Initialize old_intrcnts to 0 for the first pass, so * print_intrcnts will print total interrupts since boot */ if (old_intrcnts == NULL) { old_intrcnts = calloc(nintr, sizeof(unsigned long)); if (old_intrcnts == NULL) xo_err(1, "calloc()"); } print_intrcnts(intrcnts, old_intrcnts, intrnames, nintr, istrnamlen, period_ms); xo_flush(); free(old_intrcnts); old_intrcnts = intrcnts; if (reps >= 0 && --reps <= 0) break; usleep(interval * 1000); old_uptime = uptime; uptime = getuptime(); period_ms = (uptime - old_uptime) / 1000000; } xo_close_container("interrupt-statistics"); } static void domemstat_malloc(void) { struct memory_type_list *mtlp; struct memory_type *mtp; int error, first, i; mtlp = memstat_mtl_alloc(); if (mtlp == NULL) { xo_warn("memstat_mtl_alloc"); return; } if (kd == NULL) { if (memstat_sysctl_malloc(mtlp, 0) < 0) { xo_warnx("memstat_sysctl_malloc: %s", memstat_strerror(memstat_mtl_geterror(mtlp))); return; } } else { if (memstat_kvm_malloc(mtlp, kd) < 0) { error = memstat_mtl_geterror(mtlp); if (error == MEMSTAT_ERROR_KVM) xo_warnx("memstat_kvm_malloc: %s", kvm_geterr(kd)); else xo_warnx("memstat_kvm_malloc: %s", memstat_strerror(error)); } } xo_open_container("malloc-statistics"); xo_emit("{T:/%13s} {T:/%5s} {T:/%6s} {T:/%7s} {T:/%8s} {T:Size(s)}\n", "Type", "InUse", "MemUse", "HighUse", "Requests"); xo_open_list("memory"); for (mtp = memstat_mtl_first(mtlp); mtp != NULL; mtp = memstat_mtl_next(mtp)) { if (memstat_get_numallocs(mtp) == 0 && memstat_get_count(mtp) == 0) continue; xo_open_instance("memory"); xo_emit("{k:type/%13s/%s} {:in-use/%5" PRIu64 "} " "{:memory-use/%5" PRIu64 "}{U:K} {:high-use/%7s} " "{:requests/%8" PRIu64 "} ", memstat_get_name(mtp), memstat_get_count(mtp), (memstat_get_bytes(mtp) + 1023) / 1024, "-", memstat_get_numallocs(mtp)); first = 1; xo_open_list("size"); for (i = 0; i < 32; i++) { if (memstat_get_sizemask(mtp) & (1 << i)) { if (!first) xo_emit(","); xo_emit("{l:size/%d}", 1 << (i + 4)); first = 0; } } xo_close_list("size"); xo_close_instance("memory"); xo_emit("\n"); } xo_close_list("memory"); xo_close_container("malloc-statistics"); memstat_mtl_free(mtlp); } static void domemstat_zone(void) { struct memory_type_list *mtlp; struct memory_type *mtp; char name[MEMTYPE_MAXNAME + 1]; int error; mtlp = memstat_mtl_alloc(); if (mtlp == NULL) { xo_warn("memstat_mtl_alloc"); return; } if (kd == NULL) { if (memstat_sysctl_uma(mtlp, 0) < 0) { xo_warnx("memstat_sysctl_uma: %s", memstat_strerror(memstat_mtl_geterror(mtlp))); return; } } else { if (memstat_kvm_uma(mtlp, kd) < 0) { error = memstat_mtl_geterror(mtlp); if (error == MEMSTAT_ERROR_KVM) xo_warnx("memstat_kvm_uma: %s", kvm_geterr(kd)); else xo_warnx("memstat_kvm_uma: %s", memstat_strerror(error)); } } xo_open_container("memory-zone-statistics"); xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} " "{T:/%4s} {T:/%4s}\n\n", "ITEM", "SIZE", "LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP"); xo_open_list("zone"); for (mtp = memstat_mtl_first(mtlp); mtp != NULL; mtp = memstat_mtl_next(mtp)) { strlcpy(name, memstat_get_name(mtp), MEMTYPE_MAXNAME); strcat(name, ":"); xo_open_instance("zone"); xo_emit("{d:name/%-20s}{ke:name/%s} {:size/%6" PRIu64 "}, " "{:limit/%6" PRIu64 "},{:used/%8" PRIu64 "}," "{:free/%8" PRIu64 "},{:requests/%8" PRIu64 "}," "{:fail/%4" PRIu64 "},{:sleep/%4" PRIu64 "}\n", name, memstat_get_name(mtp), memstat_get_size(mtp), memstat_get_countlimit(mtp), memstat_get_count(mtp), memstat_get_free(mtp), memstat_get_numallocs(mtp), memstat_get_failures(mtp), memstat_get_sleeps(mtp)); xo_close_instance("zone"); } memstat_mtl_free(mtlp); xo_close_list("zone"); xo_close_container("memory-zone-statistics"); xo_emit("\n"); } static void display_object(struct kinfo_vmobject *kvo) { const char *str; xo_open_instance("object"); xo_emit("{:resident/%5jd} ", (uintmax_t)kvo->kvo_resident); xo_emit("{:active/%5jd} ", (uintmax_t)kvo->kvo_active); xo_emit("{:inactive/%5jd} ", (uintmax_t)kvo->kvo_inactive); xo_emit("{:refcount/%3d} ", kvo->kvo_ref_count); xo_emit("{:shadowcount/%3d} ", kvo->kvo_shadow_count); switch (kvo->kvo_memattr) { #ifdef VM_MEMATTR_UNCACHEABLE case VM_MEMATTR_UNCACHEABLE: str = "UC"; break; #endif #ifdef VM_MEMATTR_WRITE_COMBINING case VM_MEMATTR_WRITE_COMBINING: str = "WC"; break; #endif #ifdef VM_MEMATTR_WRITE_THROUGH case VM_MEMATTR_WRITE_THROUGH: str = "WT"; break; #endif #ifdef VM_MEMATTR_WRITE_PROTECTED case VM_MEMATTR_WRITE_PROTECTED: str = "WP"; break; #endif #ifdef VM_MEMATTR_WRITE_BACK case VM_MEMATTR_WRITE_BACK: str = "WB"; break; #endif #ifdef VM_MEMATTR_WEAK_UNCACHEABLE case VM_MEMATTR_WEAK_UNCACHEABLE: str = "UC-"; break; #endif #ifdef VM_MEMATTR_WB_WA case VM_MEMATTR_WB_WA: str = "WB"; break; #endif #ifdef VM_MEMATTR_NOCACHE case VM_MEMATTR_NOCACHE: str = "NC"; break; #endif #ifdef VM_MEMATTR_DEVICE case VM_MEMATTR_DEVICE: str = "DEV"; break; #endif #ifdef VM_MEMATTR_CACHEABLE case VM_MEMATTR_CACHEABLE: str = "C"; break; #endif #ifdef VM_MEMATTR_PREFETCHABLE case VM_MEMATTR_PREFETCHABLE: str = "PRE"; break; #endif default: str = "??"; break; } xo_emit("{:attribute/%-3s} ", str); switch (kvo->kvo_type) { case KVME_TYPE_NONE: str = "--"; break; case KVME_TYPE_DEFAULT: str = "df"; break; case KVME_TYPE_VNODE: str = "vn"; break; case KVME_TYPE_SWAP: str = "sw"; break; case KVME_TYPE_DEVICE: str = "dv"; break; case KVME_TYPE_PHYS: str = "ph"; break; case KVME_TYPE_DEAD: str = "dd"; break; case KVME_TYPE_SG: str = "sg"; break; case KVME_TYPE_UNKNOWN: default: str = "??"; break; } xo_emit("{:type/%-2s} ", str); xo_emit("{:path/%-s}\n", kvo->kvo_path); xo_close_instance("object"); } static void doobjstat(void) { struct kinfo_vmobject *kvo; int cnt, i; kvo = kinfo_getvmobject(&cnt); if (kvo == NULL) { xo_warn("Failed to fetch VM object list"); return; } xo_emit("{T:RES/%5s} {T:ACT/%5s} {T:INACT/%5s} {T:REF/%3s} {T:SHD/%3s} " "{T:CM/%3s} {T:TP/%2s} {T:PATH/%s}\n"); xo_open_list("object"); for (i = 0; i < cnt; i++) display_object(&kvo[i]); free(kvo); xo_close_list("object"); } /* * kread reads something from the kernel, given its nlist index. */ static void kreado(int nlx, void *addr, size_t size, size_t offset) { const char *sym; if (namelist[nlx].n_type == 0 || namelist[nlx].n_value == 0) { sym = namelist[nlx].n_name; if (*sym == '_') ++sym; xo_errx(1, "symbol %s not defined", sym); } if ((size_t)kvm_read(kd, namelist[nlx].n_value + offset, addr, size) != size) { sym = namelist[nlx].n_name; if (*sym == '_') ++sym; xo_errx(1, "%s: %s", sym, kvm_geterr(kd)); } } static void kread(int nlx, void *addr, size_t size) { kreado(nlx, addr, size, 0); } static char * kgetstr(const char *strp) { int n = 0, size = 1; char *ret = NULL; do { if (size == n + 1) { ret = realloc(ret, size); if (ret == NULL) xo_err(1, "%s: realloc", __func__); size *= 2; } if (kvm_read(kd, (u_long)strp + n, &ret[n], 1) != 1) xo_errx(1, "%s: %s", __func__, kvm_geterr(kd)); } while (ret[n++] != '\0'); return (ret); } static void usage(void) { xo_error("%s%s", "usage: vmstat [-afHhimoPsz] [-M core [-N system]] [-c count] [-n devs]\n", " [-p type,if,pass] [-w wait] [disks] [wait [count]]\n"); xo_finish(); exit(1); } Index: stable/11 =================================================================== --- stable/11 (revision 318715) +++ stable/11 (revision 318716) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r308474,308489,308691,308706,309203,309365,309703,309898,310720