diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index d398be941e84..0433b6dd3d7e 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1,2196 +1,2195 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. 
*/

/*
 * NOTE(review): the bare "#include" directives below carry no header names
 * in this copy of the file.  The names appear to have been lost and must be
 * restored from the original source before this translation unit can build.
 */
#include
__FBSDID("$FreeBSD$");

#include "opt_ktrace.h"
#include "opt_vm.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef KTRACE
#include
#endif

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Backward and forward arguments handed to vm_fault_prefault() by the
 * soft-fast fault path.
 */
#define	PFBAK 4
#define	PFFOR 4

/*
 * Size, in pages, of the aligned cluster that vm_fault_getpages() requests
 * from the pager when no sequential readahead is in effect.
 */
#define	VM_FAULT_READ_DEFAULT	(1 + VM_FAULT_READ_AHEAD_INIT)

/* NOTE(review): not referenced in the visible part of the file. */
#define	VM_FAULT_DONTNEED_MIN	1048576

/*
 * Working state of a single page fault.  It is passed by pointer to every
 * helper in this file so that they can share and update the fault's
 * parameters, the object/page currently being examined, and the lock state.
 */
struct faultstate {
	/* Fault parameters. */
	vm_offset_t	vaddr;
	/* If non-NULL, the faulted page is stored here and wired. */
	vm_page_t	*m_hold;
	vm_prot_t	fault_type;
	vm_prot_t	prot;
	int		fault_flags;
	boolean_t	wired;

	/* Control state. */
	/* Time of the first failed allocation; see vm_fault_allocate_oom(). */
	struct timeval	oom_start_time;
	bool		oom_started;
	/* Readahead count; -1 until vm_fault_readahead() has computed it. */
	int		nera;

	/* Page reference for cow. */
	vm_page_t m_cow;

	/* Current object. */
	vm_object_t	object;
	vm_pindex_t	pindex;
	vm_page_t	m;

	/* Top-level map object. */
	vm_object_t	first_object;
	vm_pindex_t	first_pindex;
	vm_page_t	first_m;

	/* Map state. */
	vm_map_t	map;
	vm_map_entry_t	entry;
	/* Value of map->timestamp sampled by vm_fault_lookup(). */
	int		map_generation;
	/* True while the map lookup (and its lock) is still held. */
	bool		lookup_still_valid;

	/* Vnode if locked. */
	struct vnode	*vp;
};

/*
 * Return codes for internal fault routines.
 */
enum fault_status {
	FAULT_SUCCESS = 1,	/* Return success to user. */
	FAULT_FAILURE,		/* Return failure to user. */
	FAULT_CONTINUE,		/* Continue faulting. */
	FAULT_RESTART,		/* Restart fault. */
	FAULT_OUT_OF_BOUNDS,	/* Invalid address for pager. */
	FAULT_HARD,		/* Performed I/O. */
	FAULT_SOFT,		/* Found valid page. */
	FAULT_PROTECTION_FAILURE, /* Invalid access. */
};

/* Forward declarations; the definitions are not in this part of the file. */
static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
	    int ahead);
static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
	    int backward, int forward, bool obj_locked);

/*
 * Tunables consulted by vm_fault_allocate_oom(): OOM handling is started
 * once vm_pfault_oom_attempts * vm_pfault_oom_wait seconds have elapsed
 * since the first failed allocation.  A negative attempt count makes the
 * fault handler keep waiting without ever starting OOM handling.
 */
static int vm_pfault_oom_attempts = 3;
SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
    &vm_pfault_oom_attempts, 0,
    "Number of page allocation attempts in page fault handler before it "
    "triggers OOM handling");

static int vm_pfault_oom_wait = 10;
SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
    &vm_pfault_oom_wait, 0,
    "Number of seconds to wait for free pages before retrying "
    "the page fault handler");

/*
 * Release the page referenced by *mp, if there is one: deactivate it, drop
 * its exclusive busy state, and clear *mp so that the reference is not
 * released twice.
 */
static inline void
fault_page_release(vm_page_t *mp)
{
	vm_page_t m;

	m = *mp;
	if (m != NULL) {
		/*
		 * We are likely to loop around again and attempt to busy
		 * this page.  Deactivating it leaves it available for
		 * pageout while optimizing fault restarts.
		 */
		vm_page_deactivate(m);
		vm_page_xunbusy(m);
		*mp = NULL;
	}
}

/*
 * Dispose of the page referenced by *mp, if there is one.  The page is
 * freed unless it is wired, in which case only its exclusive busy state is
 * dropped.  The page's object must be write-locked (asserted).  *mp is
 * cleared in either case.
 */
static inline void
fault_page_free(vm_page_t *mp)
{
	vm_page_t m;

	m = *mp;
	if (m != NULL) {
		VM_OBJECT_ASSERT_WLOCKED(m->object);
		if (!vm_page_wired(m))
			vm_page_free(m);
		else
			vm_page_xunbusy(m);
		*mp = NULL;
	}
}

/*
 * Return true if a vm_pager_get_pages() call is needed in order to check
 * whether the pager might have a particular page, false if it can be determined
 * immediately that the pager can not have a copy.  For swap objects, this can
 * be checked quickly.
*/
static inline bool
fault_object_needs_getpages(vm_object_t object)
{
	VM_OBJECT_ASSERT_LOCKED(object);

	/*
	 * Non-swap objects always need the pager call; a swap object needs
	 * it only when its swap block trie is non-empty.
	 */
	return ((object->flags & OBJ_SWAP) == 0 ||
	    !pctrie_is_empty(&object->un_pager.swp.swp_blks));
}

/*
 * Finish the map lookup for this fault if it is still held, and record
 * that the lookup is no longer valid.  Safe to call repeatedly.
 */
static inline void
unlock_map(struct faultstate *fs)
{

	if (fs->lookup_still_valid) {
		vm_map_lookup_done(fs->map, fs->entry);
		fs->lookup_still_valid = false;
	}
}

/*
 * vput() the vnode recorded in fs->vp, if any, and clear the field.
 */
static void
unlock_vp(struct faultstate *fs)
{

	if (fs->vp != NULL) {
		vput(fs->vp);
		fs->vp = NULL;
	}
}

/*
 * Tear down the state accumulated by a fault.  Releases the COW source
 * page and the current page, calls vm_object_pip_wakeup() on the current
 * object, and, when the current object is not the top-level object, frees
 * first_m under the top-level object's write lock and calls
 * vm_object_pip_wakeup() on that object as well.  Finally drops the
 * first_object reference, the map lookup and the vnode.
 *
 * This function does not unlock fs->object; see unlock_and_deallocate().
 */
static void
fault_deallocate(struct faultstate *fs)
{

	fault_page_release(&fs->m_cow);
	fault_page_release(&fs->m);
	vm_object_pip_wakeup(fs->object);
	if (fs->object != fs->first_object) {
		VM_OBJECT_WLOCK(fs->first_object);
		fault_page_free(&fs->first_m);
		VM_OBJECT_WUNLOCK(fs->first_object);
		vm_object_pip_wakeup(fs->first_object);
	}
	vm_object_deallocate(fs->first_object);
	unlock_map(fs);
	unlock_vp(fs);
}

/*
 * Write-unlock the current object and then tear down the fault state.
 */
static void
unlock_and_deallocate(struct faultstate *fs)
{

	VM_OBJECT_WUNLOCK(fs->object);
	fault_deallocate(fs);
}

/*
 * Update dirty-tracking state for page m once a fault has resolved to it.
 *
 * Nothing is done when the mapping is not writable and VM_FAULT_DIRTY was
 * not requested, or when the page is unmanaged.  Otherwise the page's
 * object is marked writeable-dirty and, for a non-wiring write fault or an
 * explicit VM_FAULT_DIRTY request, the page itself is dirtied.
 */
static void
vm_fault_dirty(struct faultstate *fs, vm_page_t m)
{
	bool need_dirty;

	if (((fs->prot & VM_PROT_WRITE) == 0 &&
	    (fs->fault_flags & VM_FAULT_DIRTY) == 0) ||
	    (m->oflags & VPO_UNMANAGED) != 0)
		return;

	VM_PAGE_OBJECT_BUSY_ASSERT(m);

	need_dirty = ((fs->fault_type & VM_PROT_WRITE) != 0 &&
	    (fs->fault_flags & VM_FAULT_WIRE) == 0) ||
	    (fs->fault_flags & VM_FAULT_DIRTY) != 0;

	vm_object_set_writeable_dirty(m->object);

	/*
	 * If the fault is a write, we know that this page is being
	 * written NOW so dirty it explicitly to save on
	 * pmap_is_modified() calls later.
	 *
	 * Also, since the page is now dirty, we can possibly tell
	 * the pager to release any swap backing the page.
	 */
	if (need_dirty && vm_page_set_dirty(m) == 0) {
		/*
		 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
		 * if the page is already dirty to prevent data written with
		 * the expectation of being synced from not being synced.
		 * Likewise if this entry does not request NOSYNC then make
		 * sure the page isn't marked NOSYNC.  Applications sharing
		 * data should use the same flags to avoid ping ponging.
		 */
		if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0)
			vm_page_aflag_set(m, PGA_NOSYNC);
		else
			vm_page_aflag_clear(m, PGA_NOSYNC);
	}
}

/*
 * Fast path: try to map a page that is already resident and fully valid in
 * the top-level object, without allocating or calling a pager.
 *
 * Returns FAULT_SUCCESS when the mapping was entered, or FAULT_FAILURE when
 * the page is missing, not fully valid, busy while write access is wanted,
 * or pmap_enter() (called with PMAP_ENTER_NOSLEEP) did not succeed.
 *
 * Unlocks fs.first_object and fs.map on success.
 */
static enum fault_status
vm_fault_soft_fast(struct faultstate *fs)
{
	vm_page_t m, m_map;
#if VM_NRESERVLEVEL > 0
	vm_page_t m_super;
	int flags;
#endif
	int psind;
	vm_offset_t vaddr;
	enum fault_status res;

	MPASS(fs->vp == NULL);

	res = FAULT_SUCCESS;
	vaddr = fs->vaddr;
	vm_object_busy(fs->first_object);
	m = vm_page_lookup(fs->first_object, fs->first_pindex);
	/* A busy page can be mapped for read|execute access. */
	if (m == NULL || ((fs->prot & VM_PROT_WRITE) != 0 &&
	    vm_page_busied(m)) || !vm_page_all_valid(m)) {
		res = FAULT_FAILURE;
		goto out;
	}
	/* Default to a base-page mapping of the faulted page itself. */
	m_map = m;
	psind = 0;
#if VM_NRESERVLEVEL > 0
	/*
	 * Consider a superpage mapping when the page belongs to a
	 * superpage that lies entirely within the map entry, the virtual
	 * and physical offsets within the superpage agree, the entry is
	 * not wired, and the pmap has superpages enabled.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    (m_super = vm_reserv_to_superpage(m)) != NULL &&
	    rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
	    roundup2(vaddr + 1, pagesizes[m_super->psind]) <=
	    fs->entry->end && (vaddr & (pagesizes[m_super->psind] - 1)) ==
	    (VM_PAGE_TO_PHYS(m) & (pagesizes[m_super->psind] - 1)) &&
	    !fs->wired && pmap_ps_enabled(fs->map->pmap)) {
		flags = PS_ALL_VALID;
		if ((fs->prot & VM_PROT_WRITE) != 0) {
			/*
			 * Create a superpage mapping allowing write access
			 * only if none of the constituent pages are busy and
			 * all of them are already dirty (except possibly for
			 * the page that was faulted on).
			 */
			flags |= PS_NONE_BUSY;
			if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
				flags |= PS_ALL_DIRTY;
		}
		if (vm_page_ps_test(m_super, flags, m)) {
			m_map = m_super;
			psind = m_super->psind;
			vaddr = rounddown2(vaddr, pagesizes[psind]);
			/* Preset the modified bit for dirty superpages. */
			if ((flags & PS_ALL_DIRTY) != 0)
				fs->fault_type |= VM_PROT_WRITE;
		}
	}
#endif
	if (pmap_enter(fs->map->pmap, vaddr, m_map, fs->prot, fs->fault_type |
	    PMAP_ENTER_NOSLEEP | (fs->wired ? PMAP_ENTER_WIRED : 0), psind) !=
	    KERN_SUCCESS) {
		res = FAULT_FAILURE;
		goto out;
	}
	/* Hand the faulted page back to the caller, wired, if requested. */
	if (fs->m_hold != NULL) {
		(*fs->m_hold) = m;
		vm_page_wire(m);
	}
	if (psind == 0 && !fs->wired)
		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
	VM_OBJECT_RUNLOCK(fs->first_object);
	vm_fault_dirty(fs, m);
	vm_map_lookup_done(fs->map, fs->entry);
	curthread->td_ru.ru_minflt++;

out:
	/* The object busy state is dropped on both outcomes. */
	vm_object_unbusy(fs->first_object);
	return (res);
}

/*
 * Re-acquire the map read lock while holding the top-level object's write
 * lock.  A try-lock is attempted first; if it fails, the object lock is
 * dropped, the map lock is acquired with blocking, and the object lock is
 * retaken.  On return the lookup is marked valid again; the caller is
 * responsible for comparing the map timestamp with fs->map_generation.
 */
static void
vm_fault_restore_map_lock(struct faultstate *fs)
{

	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
	MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);

	if (!vm_map_trylock_read(fs->map)) {
		VM_OBJECT_WUNLOCK(fs->first_object);
		vm_map_lock_read(fs->map);
		VM_OBJECT_WLOCK(fs->first_object);
	}
	fs->lookup_still_valid = true;
}

/*
 * Assert that a page returned by the pager's populate method is usable.
 */
static void
vm_fault_populate_check_page(vm_page_t m)
{

	/*
	 * Check each page to ensure that the pager is obeying the
	 * interface: the page must be installed in the object, fully
	 * valid, and exclusively busied.
	 */
	MPASS(m != NULL);
	MPASS(vm_page_all_valid(m));
	MPASS(vm_page_xbusied(m));
}

/*
 * Give back populated pages that will not be mapped: every page with an
 * index in [first, last] of the write-locked object is checked,
 * deactivated and has its exclusive busy state dropped.
 */
static void
vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
    vm_pindex_t last)
{
	vm_page_t m;
	vm_pindex_t pidx;

	VM_OBJECT_ASSERT_WLOCKED(object);
	MPASS(first <= last);
	for (pidx = first, m = vm_page_lookup(object, pidx);
	    pidx <= last; pidx++, m = vm_page_next(m)) {
		vm_fault_populate_check_page(m);
		vm_page_deactivate(m);
		vm_page_xunbusy(m);
	}
}

/*
 * Resolve the fault through the pager's populate method and map the pages
 * it returns.
 *
 * Entered with the top-level object write-locked, the map lookup valid and
 * fs->object == fs->first_object (all asserted).  The map and vnode are
 * unlocked around the pager call and the map lock is restored afterwards.
 *
 * Returns:
 *   FAULT_SUCCESS   the populated pages were mapped;
 *   FAULT_FAILURE   the pager reported an error, or the large-page
 *                   pmap_enter() failed;
 *   FAULT_RESTART   the map changed while it was unlocked;
 *   FAULT_CONTINUE  the pager returned VM_PAGER_BAD, asking for normal
 *                   fault handling.
 */
static enum fault_status
vm_fault_populate(struct faultstate *fs)
{
	vm_offset_t vaddr;
	vm_page_t m;
	vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
	int bdry_idx, i, npages, psind, rv;
	enum fault_status res;

	MPASS(fs->object == fs->first_object);
	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
	MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
	MPASS(fs->first_object->backing_object == NULL);
	MPASS(fs->lookup_still_valid);

	/* Initial hint for the pager: the whole range of the map entry. */
	pager_first = OFF_TO_IDX(fs->entry->offset);
	pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
	unlock_map(fs);
	unlock_vp(fs);

	res = FAULT_SUCCESS;

	/*
	 * Call the pager (driver) populate() method.
	 *
	 * There is no guarantee that the method will be called again
	 * if the current fault is for read, and a future fault is
	 * for write.  Report the entry's maximum allowed protection
	 * to the driver.
	 */
	rv = vm_pager_populate(fs->first_object, fs->first_pindex,
	    fs->fault_type, fs->entry->max_protection, &pager_first,
	    &pager_last);

	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
	if (rv == VM_PAGER_BAD) {
		/*
		 * VM_PAGER_BAD is the backdoor for a pager to request
		 * normal fault handling.
		 */
		vm_fault_restore_map_lock(fs);
		if (fs->map->timestamp != fs->map_generation)
			return (FAULT_RESTART);
		return (FAULT_CONTINUE);
	}
	if (rv != VM_PAGER_OK)
		return (FAULT_FAILURE); /* AKA SIGSEGV */

	/* Ensure that the driver is obeying the interface. */
	MPASS(pager_first <= pager_last);
	MPASS(fs->first_pindex <= pager_last);
	MPASS(fs->first_pindex >= pager_first);
	MPASS(pager_last < fs->first_object->size);

	vm_fault_restore_map_lock(fs);
	/* A non-zero boundary index selects the large-page case below. */
	bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
	    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
	if (fs->map->timestamp != fs->map_generation) {
		/* The map changed: undo the busy state and start over. */
		if (bdry_idx == 0) {
			vm_fault_populate_cleanup(fs->first_object, pager_first,
			    pager_last);
		} else {
			m = vm_page_lookup(fs->first_object, pager_first);
			if (m != fs->m)
				vm_page_xunbusy(m);
		}
		return (FAULT_RESTART);
	}

	/*
	 * The map is unchanged after our last unlock.  Process the fault.
	 *
	 * First, the special case of largepage mappings, where
	 * populate only busies the first page in superpage run.
	 */
	if (bdry_idx != 0) {
		KASSERT(PMAP_HAS_LARGEPAGES,
		    ("missing pmap support for large pages"));
		m = vm_page_lookup(fs->first_object, pager_first);
		vm_fault_populate_check_page(m);
		VM_OBJECT_WUNLOCK(fs->first_object);
		vaddr = fs->entry->start + IDX_TO_OFF(pager_first) -
		    fs->entry->offset;
		/* assert alignment for entry */
		KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0,
		    ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx",
		    (uintmax_t)fs->entry->start, (uintmax_t)pager_first,
		    (uintmax_t)fs->entry->offset, (uintmax_t)vaddr));
		KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0,
		    ("unaligned superpage m %p %#jx", m,
		    (uintmax_t)VM_PAGE_TO_PHYS(m)));
		rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot,
		    fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0) |
		    PMAP_ENTER_LARGEPAGE, bdry_idx);
		VM_OBJECT_WLOCK(fs->first_object);
		vm_page_xunbusy(m);
		if (rv != KERN_SUCCESS) {
			res = FAULT_FAILURE;
			goto out;
		}
		/* A wiring fault wires every base page of the large page. */
		if ((fs->fault_flags & VM_FAULT_WIRE) != 0) {
			for (i = 0; i < atop(pagesizes[bdry_idx]); i++)
				vm_page_wire(m + i);
		}
		if (fs->m_hold != NULL) {
			*fs->m_hold = m + (fs->first_pindex - pager_first);
			vm_page_wire(*fs->m_hold);
		}
		goto out;
	}

	/*
	 * The range [pager_first, pager_last] that is given to the
	 * pager is only a hint.  The pager may populate any range
	 * within the object that includes the requested page index.
	 * In case the pager expanded the range, clip it to fit into
	 * the map entry.
	 */
	map_first = OFF_TO_IDX(fs->entry->offset);
	if (map_first > pager_first) {
		vm_fault_populate_cleanup(fs->first_object, pager_first,
		    map_first - 1);
		pager_first = map_first;
	}
	map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
	if (map_last < pager_last) {
		vm_fault_populate_cleanup(fs->first_object, map_last + 1,
		    pager_last);
		pager_last = map_last;
	}
	/*
	 * Map the clipped range, advancing by one mapping (npages base
	 * pages) per iteration.
	 */
	for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
	    pidx <= pager_last;
	    pidx += npages, m = vm_page_next(&m[npages - 1])) {
		vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;

		/*
		 * Fall back to a base-page mapping when the superpage is
		 * misaligned, extends past the populated range, superpages
		 * are disabled for this pmap, or the entry is wired.
		 */
		psind = m->psind;
		if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
		    pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
		    !pmap_ps_enabled(fs->map->pmap) || fs->wired))
			psind = 0;

		npages = atop(pagesizes[psind]);
		for (i = 0; i < npages; i++) {
			vm_fault_populate_check_page(&m[i]);
			vm_fault_dirty(fs, &m[i]);
		}
		VM_OBJECT_WUNLOCK(fs->first_object);
		rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, fs->fault_type |
		    (fs->wired ? PMAP_ENTER_WIRED : 0), psind);

		/*
		 * pmap_enter() may fail for a superpage mapping if additional
		 * protection policies prevent the full mapping.
		 * For example, this will happen on amd64 if the entire
		 * address range does not share the same userspace protection
		 * key.  Revert to single-page mappings if this happens.
		 */
		MPASS(rv == KERN_SUCCESS ||
		    (psind > 0 && rv == KERN_PROTECTION_FAILURE));
		if (__predict_false(psind > 0 &&
		    rv == KERN_PROTECTION_FAILURE)) {
			MPASS(!fs->wired);
			for (i = 0; i < npages; i++) {
				rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
				    &m[i], fs->prot, fs->fault_type, 0);
				MPASS(rv == KERN_SUCCESS);
			}
		}

		VM_OBJECT_WLOCK(fs->first_object);
		/*
		 * Only the page that was actually faulted on is wired or
		 * returned through m_hold; the others are activated.
		 */
		for (i = 0; i < npages; i++) {
			if ((fs->fault_flags & VM_FAULT_WIRE) != 0 &&
			    m[i].pindex == fs->first_pindex)
				vm_page_wire(&m[i]);
			else
				vm_page_activate(&m[i]);
			if (fs->m_hold != NULL &&
			    m[i].pindex == fs->first_pindex) {
				(*fs->m_hold) = &m[i];
				vm_page_wire(&m[i]);
			}
			vm_page_xunbusy(&m[i]);
		}
	}
out:
	curthread->td_ru.ru_majflt++;
	return (res);
}

/*
 * Selects the signal that vm_fault_trap() reports for a protection
 * failure: 0 autodetects from the process ABI and osrel, 1 always reports
 * SIGBUS (compat mode), any other value always reports SIGSEGV.
 */
static int prot_fault_translation;
SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
    &prot_fault_translation, 0,
    "Control signal to deliver on protection fault");

/* compat definition to keep common code for signal translation */
#define	UCODE_PAGEFLT	12
#ifdef T_PAGEFLT
_Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
#endif

/*
 *	vm_fault_trap:
 *
 *	Handle a page fault occurring at the given address,
 *	requiring the given permissions, in the map specified.
 *	If successful, the page is inserted into the
 *	associated physical map.
 *
 *	NOTE: the given address should be truncated to the
 *	proper page address.
 *
 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
 *	a standard error specifying why the fault is fatal is returned.
 *
 *	The map in question must be referenced, and remains so.
 *	Caller may hold no locks.
*/
/*
 * The result of vm_fault() on the page-truncated address is returned
 * unchanged.  When it is not KERN_SUCCESS and signo is non-NULL, *signo and
 * *ucode are set to the signal number and code that describe the failure;
 * ucode must be non-NULL whenever signo is (asserted).
 */
int
vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
    int fault_flags, int *signo, int *ucode)
{
	int result;

	MPASS(signo == NULL || ucode != NULL);
#ifdef KTRACE
	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT))
		ktrfault(vaddr, fault_type);
#endif
	result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags,
	    NULL);
	KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE ||
	    result == KERN_INVALID_ADDRESS ||
	    result == KERN_RESOURCE_SHORTAGE ||
	    result == KERN_PROTECTION_FAILURE ||
	    result == KERN_OUT_OF_BOUNDS,
	    ("Unexpected Mach error %d from vm_fault()", result));
#ifdef KTRACE
	if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND))
		ktrfaultend(result);
#endif
	/* Translate the failure into a signal number and code. */
	if (result != KERN_SUCCESS && signo != NULL) {
		switch (result) {
		case KERN_FAILURE:
		case KERN_INVALID_ADDRESS:
			*signo = SIGSEGV;
			*ucode = SEGV_MAPERR;
			break;
		case KERN_RESOURCE_SHORTAGE:
			*signo = SIGBUS;
			*ucode = BUS_OOMERR;
			break;
		case KERN_OUT_OF_BOUNDS:
			*signo = SIGBUS;
			*ucode = BUS_OBJERR;
			break;
		case KERN_PROTECTION_FAILURE:
			if (prot_fault_translation == 0) {
				/*
				 * Autodetect.  This check also covers
				 * the images without the ABI-tag ELF
				 * note.
				 */
				if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
				    curproc->p_osrel >= P_OSREL_SIGSEGV) {
					*signo = SIGSEGV;
					*ucode = SEGV_ACCERR;
				} else {
					*signo = SIGBUS;
					*ucode = UCODE_PAGEFLT;
				}
			} else if (prot_fault_translation == 1) {
				/* Always compat mode. */
				*signo = SIGBUS;
				*ucode = UCODE_PAGEFLT;
			} else {
				/* Always SIGSEGV mode. */
				*signo = SIGSEGV;
				*ucode = SEGV_ACCERR;
			}
			break;
		default:
			KASSERT(0, ("Unexpected Mach error %d from vm_fault()",
			    result));
			break;
		}
	}
	return (result);
}

/*
 * Make sure that the vnode behind the current object is locked and
 * recorded in fs->vp.
 *
 * Returns FAULT_CONTINUE when the object is not vnode-backed, when the
 * right vnode is already held, or when the lock was obtained without
 * sleeping.  Otherwise the fault state is torn down (objlocked tells
 * whether fs->object is locked and must be unlocked first), the vnode lock
 * is acquired with sleeping, and FAULT_RESTART is returned with the vnode
 * left in fs->vp.
 */
static enum fault_status
vm_fault_lock_vnode(struct faultstate *fs, bool objlocked)
{
	struct vnode *vp;
	int error, locked;

	if (fs->object->type != OBJT_VNODE)
		return (FAULT_CONTINUE);
	vp = fs->object->handle;
	if (vp == fs->vp) {
		ASSERT_VOP_LOCKED(vp, "saved vnode is not locked");
		return (FAULT_CONTINUE);
	}

	/*
	 * Perform an unlock in case the desired vnode changed while
	 * the map was unlocked during a retry.
	 */
	unlock_vp(fs);

	/* Request an exclusive lock only if that is how it is held now. */
	locked = VOP_ISLOCKED(vp);
	if (locked != LK_EXCLUSIVE)
		locked = LK_SHARED;

	/*
	 * We must not sleep acquiring the vnode lock while we have
	 * the page exclusive busied or the object's
	 * paging-in-progress count incremented.  Otherwise, we could
	 * deadlock.
	 */
	error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT);
	if (error == 0) {
		fs->vp = vp;
		return (FAULT_CONTINUE);
	}

	/* Hold the vnode across the teardown and the sleeping vget(). */
	vhold(vp);
	if (objlocked)
		unlock_and_deallocate(fs);
	else
		fault_deallocate(fs);
	error = vget(vp, locked | LK_RETRY | LK_CANRECURSE);
	vdrop(vp);
	fs->vp = vp;
	KASSERT(error == 0, ("vm_fault: vget failed %d", error));
	return (FAULT_RESTART);
}

/*
 * Calculate the desired readahead.  Handle drop-behind.
 *
 * Returns the number of readahead blocks to pass to the pager.
 *
 * The map lookup must still be valid (asserted).  The entry's read_ahead
 * field is updated when the computed value differs from the stored one.
 */
static int
vm_fault_readahead(struct faultstate *fs)
{
	int era, nera;
	u_char behavior;

	KASSERT(fs->lookup_still_valid, ("map unlocked"));
	era = fs->entry->read_ahead;
	behavior = vm_map_entry_behavior(fs->entry);
	if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
		nera = 0;
	} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
		nera = VM_FAULT_READ_AHEAD_MAX;
		if (fs->vaddr == fs->entry->next_read)
			vm_fault_dontneed(fs, fs->vaddr, nera);
	} else if (fs->vaddr == fs->entry->next_read) {
		/*
		 * This is a sequential fault.  Arithmetically
		 * increase the requested number of pages in
		 * the read-ahead window.  The requested
		 * number of pages is "# of sequential faults
		 * x (read ahead min + 1) + read ahead min"
		 */
		nera = VM_FAULT_READ_AHEAD_MIN;
		if (era > 0) {
			nera += era + 1;
			if (nera > VM_FAULT_READ_AHEAD_MAX)
				nera = VM_FAULT_READ_AHEAD_MAX;
		}
		if (era == VM_FAULT_READ_AHEAD_MAX)
			vm_fault_dontneed(fs, fs->vaddr, nera);
	} else {
		/*
		 * This is a non-sequential fault.
		 */
		nera = 0;
	}
	if (era != nera) {
		/*
		 * A read lock on the map suffices to update
		 * the read ahead count safely.
		 */
		fs->entry->read_ahead = nera;
	}

	return (nera);
}

/*
 * Perform the map lookup for fs->vaddr, filling in the entry, top-level
 * object and index, protection and wired state of the fault.
 *
 * Returns KERN_SUCCESS with the lookup marked valid and the map timestamp
 * saved in fs->map_generation; the error from vm_map_lookup() (after
 * dropping any held vnode); or KERN_RESOURCE_SHORTAGE when the entry is in
 * transition on behalf of another thread.  In the last case the map lock
 * has been released, after waiting for a wakeup if the entry was still in
 * transition under the exclusive map lock.
 */
static int
vm_fault_lookup(struct faultstate *fs)
{
	int result;

	KASSERT(!fs->lookup_still_valid,
	   ("vm_fault_lookup: Map already locked."));

	result = vm_map_lookup(&fs->map, fs->vaddr, fs->fault_type |
	    VM_PROT_FAULT_LOOKUP, &fs->entry, &fs->first_object,
	    &fs->first_pindex, &fs->prot, &fs->wired);
	if (result != KERN_SUCCESS) {
		unlock_vp(fs);
		return (result);
	}

	fs->map_generation = fs->map->timestamp;

	if (fs->entry->eflags & MAP_ENTRY_NOFAULT) {
		panic("%s: fault on nofault entry, addr: %#lx",
		    __func__, (u_long)fs->vaddr);
	}

	if (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION &&
	    fs->entry->wiring_thread != curthread) {
		/* Trade the read lock for the exclusive lock and recheck. */
		vm_map_unlock_read(fs->map);
		vm_map_lock(fs->map);
		if (vm_map_lookup_entry(fs->map, fs->vaddr, &fs->entry) &&
		    (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
			unlock_vp(fs);
			fs->entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			vm_map_unlock_and_wait(fs->map, 0);
		} else
			vm_map_unlock(fs->map);
		return (KERN_RESOURCE_SHORTAGE);
	}

	MPASS((fs->entry->eflags & MAP_ENTRY_GUARD) == 0);

	/*
	 * For a wired entry the fault type becomes the entry's full
	 * protection, keeping only VM_PROT_COPY from the original request.
	 */
	if (fs->wired)
		fs->fault_type = fs->prot | (fs->fault_type & VM_PROT_COPY);
	else
		KASSERT((fs->fault_flags & VM_FAULT_WIRE) == 0,
		    ("!fs->wired && VM_FAULT_WIRE"));
	fs->lookup_still_valid = true;

	return (KERN_SUCCESS);
}

/*
 * Re-validate the map lookup after the map has been unlocked.
 *
 * Returns KERN_SUCCESS when the map is unchanged or the new lookup still
 * resolves to the same top-level object and index (fs->prot and
 * fs->fault_type are then narrowed to the protection now allowed).
 * Returns KERN_RESTART when the map lock could not be taken without
 * blocking, the lookup would have blocked, the object or index changed, or
 * no protection remains.  Other lookup errors are passed through.
 */
static int
vm_fault_relookup(struct faultstate *fs)
{
	vm_object_t retry_object;
	vm_pindex_t retry_pindex;
	vm_prot_t retry_prot;
	int result;

	if (!vm_map_trylock_read(fs->map))
		return (KERN_RESTART);

	fs->lookup_still_valid = true;
	if (fs->map->timestamp == fs->map_generation)
		return (KERN_SUCCESS);

	result = vm_map_lookup_locked(&fs->map, fs->vaddr, fs->fault_type,
	    &fs->entry, &retry_object, &retry_pindex, &retry_prot,
	    &fs->wired);
	if (result != KERN_SUCCESS) {
		/*
		 * If retry of map lookup would have blocked then
		 * retry fault from start.
		 */
		if (result == KERN_FAILURE)
			return (KERN_RESTART);
		return (result);
	}
	if (retry_object != fs->first_object ||
	    retry_pindex != fs->first_pindex)
		return (KERN_RESTART);

	/*
	 * Check whether the protection has changed or the object has
	 * been copied while we left the map unlocked. Changing from
	 * read to write permission is OK - we leave the page
	 * write-protected, and catch the write fault. Changing from
	 * write to read permission means that we can't mark the page
	 * write-enabled after all.
	 */
	fs->prot &= retry_prot;
	fs->fault_type &= retry_prot;
	if (fs->prot == 0)
		return (KERN_RESTART);

	/* Reassert because wired may have changed. */
	KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0,
	    ("!wired && VM_FAULT_WIRE"));

	return (KERN_SUCCESS);
}

/*
 * Provide the top-level object with its own copy of the page fs->m that
 * was found in a backing object.  The page is either moved into the
 * top-level object, when the backing object qualifies and both object
 * locks can be try-locked, or copied into fs->first_m.  On return the
 * fault state refers to the top-level object, index and page.
 */
static void
vm_fault_cow(struct faultstate *fs)
{
	bool is_first_object_locked;

	KASSERT(fs->object != fs->first_object,
	    ("source and target COW objects are identical"));

	/*
	 * This allows pages to be virtually copied from a backing_object
	 * into the first_object, where the backing object has no other
	 * refs to it, and cannot gain any more refs.  Instead of a bcopy,
	 * we just move the page from the backing object to the first
	 * object.  Note that we must mark the page dirty in the first
	 * object so that it will go out to swap when needed.
	 */
	is_first_object_locked = false;
	if (
	    /*
	     * Only one shadow object and no other refs.
	     */
	    fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
	    /*
	     * No other ways to look the object up
	     */
	    fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
	    /*
	     * We don't chase down the shadow chain and we can acquire locks.
	     */
	    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
	    fs->object == fs->first_object->backing_object &&
	    VM_OBJECT_TRYWLOCK(fs->object)) {
		/*
		 * Remove but keep xbusy for replace.  fs->m is moved into
		 * fs->first_object and left busy while fs->first_m is
		 * conditionally freed.
		 */
		vm_page_remove_xbusy(fs->m);
		vm_page_replace(fs->m, fs->first_object, fs->first_pindex,
		    fs->first_m);
		vm_page_dirty(fs->m);
#if VM_NRESERVLEVEL > 0
		/*
		 * Rename the reservation.
		 */
		vm_reserv_rename(fs->m, fs->first_object, fs->object,
		    OFF_TO_IDX(fs->first_object->backing_object_offset));
#endif
		VM_OBJECT_WUNLOCK(fs->object);
		VM_OBJECT_WUNLOCK(fs->first_object);
		fs->first_m = fs->m;
		fs->m = NULL;
		VM_CNT_INC(v_cow_optim);
	} else {
		/* The try-lock of first_object may have succeeded; undo it. */
		if (is_first_object_locked)
			VM_OBJECT_WUNLOCK(fs->first_object);
		/*
		 * Oh, well, lets copy it.
		 */
		pmap_copy_page(fs->m, fs->first_m);
		vm_page_valid(fs->first_m);
		if (fs->wired && (fs->fault_flags & VM_FAULT_WIRE) == 0) {
			vm_page_wire(fs->first_m);
			vm_page_unwire(fs->m, PQ_INACTIVE);
		}
		/*
		 * Save the cow page to be released after
		 * pmap_enter is complete.
		 */
		fs->m_cow = fs->m;
		fs->m = NULL;

		/*
		 * Typically, the shadow object is either private to this
		 * address space (OBJ_ONEMAPPING) or its pages are read only.
		 * In the highly unusual case where the pages of a shadow object
		 * are read/write shared between this and other address spaces,
		 * we need to ensure that any pmap-level mappings to the
		 * original, copy-on-write page from the backing object are
		 * removed from those other address spaces.
		 *
		 * The flag check is racy, but this is tolerable: if
		 * OBJ_ONEMAPPING is cleared after the check, the busy state
		 * ensures that new mappings of m_cow can't be created.
		 * pmap_enter() will replace an existing mapping in the current
		 * address space.  If OBJ_ONEMAPPING is set after the check,
		 * removing mappings will at worse trigger some unnecessary page
		 * faults.
		 */
		vm_page_assert_xbusied(fs->m_cow);
		if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0)
			pmap_remove_all(fs->m_cow);
	}

	vm_object_pip_wakeup(fs->object);

	/*
	 * Only use the new page below...
	 */
	fs->object = fs->first_object;
	fs->pindex = fs->first_pindex;
	fs->m = fs->first_m;
	VM_CNT_INC(v_cow_faults);
	curthread->td_cow++;
}

/*
 * Step from the current object to its backing object.
 *
 * Returns false, leaving the current object locked, when there is no
 * backing object.  Otherwise the backing object is write-locked and gains
 * a paging-in-progress reference, fs->pindex is translated by the backing
 * object offset, the previous object is unlocked, and true is returned.
 */
static bool
vm_fault_next(struct faultstate *fs)
{
	vm_object_t next_object;

	/*
	 * The requested page does not exist at this object/
	 * offset.  Remove the invalid page from the object,
	 * waking up anyone waiting for it, and continue on to
	 * the next object.  However, if this is the top-level
	 * object, we must leave the busy page in place to
	 * prevent another process from rushing past us, and
	 * inserting the page in that object at the same time
	 * that we are.
	 */
	if (fs->object == fs->first_object) {
		fs->first_m = fs->m;
		fs->m = NULL;
	} else
		fault_page_free(&fs->m);

	/*
	 * Move on to the next object.  Lock the next object before
	 * unlocking the current one.
	 */
	VM_OBJECT_ASSERT_WLOCKED(fs->object);
	next_object = fs->object->backing_object;
	if (next_object == NULL)
		return (false);
	MPASS(fs->first_m != NULL);
	KASSERT(fs->object != next_object, ("object loop %p", next_object));
	VM_OBJECT_WLOCK(next_object);
	vm_object_pip_add(next_object, 1);
	/* The top-level object's paging-in-progress reference is retained. */
	if (fs->object != fs->first_object)
		vm_object_pip_wakeup(fs->object);
	fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset);
	VM_OBJECT_WUNLOCK(fs->object);
	fs->object = next_object;

	return (true);
}

/*
 * Satisfy the fault with a zero-filled page: switch the fault state back
 * to the top-level object, take over first_m as the current page, zero it
 * unless it is already marked PG_ZERO, and mark it valid.
 */
static void
vm_fault_zerofill(struct faultstate *fs)
{

	/*
	 * If there's no object left, fill the page in the top
	 * object with zeros.
	 */
	if (fs->object != fs->first_object) {
		vm_object_pip_wakeup(fs->object);
		fs->object = fs->first_object;
		fs->pindex = fs->first_pindex;
	}
	MPASS(fs->first_m != NULL);
	MPASS(fs->m == NULL);
	fs->m = fs->first_m;
	fs->first_m = NULL;

	/*
	 * Zero the page if necessary and mark it valid.
	 */
	if ((fs->m->flags & PG_ZERO) == 0) {
		pmap_zero_page(fs->m);
	} else {
		VM_CNT_INC(v_ozfod);
	}
	VM_CNT_INC(v_zfod);
	vm_page_valid(fs->m);
}

/*
 * Initiate page fault after timeout.  Returns true if caller should
 * do vm_waitpfault() after the call.
 *
 * The fault state is always unlocked and torn down first.  OOM handling is
 * started, and false returned, only once vm_pfault_oom_attempts *
 * vm_pfault_oom_wait seconds have passed since the first failure recorded
 * in fs->oom_start_time; a negative vm_pfault_oom_attempts disables it.
 */
static bool
vm_fault_allocate_oom(struct faultstate *fs)
{
	struct timeval now;

	unlock_and_deallocate(fs);
	if (vm_pfault_oom_attempts < 0)
		return (true);
	/* First failure: start the clock and let the caller wait. */
	if (!fs->oom_started) {
		fs->oom_started = true;
		getmicrotime(&fs->oom_start_time);
		return (true);
	}

	getmicrotime(&now);
	timevalsub(&now, &fs->oom_start_time);
	if (now.tv_sec < vm_pfault_oom_attempts * vm_pfault_oom_wait)
		return (true);

	if (bootverbose)
		printf(
	    "proc %d (%s) failed to alloc page on fault, starting OOM\n",
		    curproc->p_pid, curproc->p_comm);
	vm_pageout_oom(VM_OOM_MEM_PF);
	fs->oom_started = false;
	return (false);
}

/*
 * Allocate a page directly or via the object populate method.
 *
 * Returns FAULT_CONTINUE with fs->m set to the newly allocated page;
 * FAULT_OUT_OF_BOUNDS when fs->pindex lies beyond the object's size;
 * FAULT_RESTART when the vnode lock required a restart or no page could be
 * allocated; or the FAULT_SUCCESS / FAULT_FAILURE / FAULT_RESTART result of
 * vm_fault_populate().  Except for FAULT_CONTINUE, the fault state has been
 * unlocked and torn down before returning.
 */
static enum fault_status
vm_fault_allocate(struct faultstate *fs)
{
	struct domainset *dset;
	enum fault_status res;

	if ((fs->object->flags & OBJ_SIZEVNLOCK) != 0) {
		res = vm_fault_lock_vnode(fs, true);
		MPASS(res == FAULT_CONTINUE || res == FAULT_RESTART);
		if (res == FAULT_RESTART)
			return (res);
	}

	if (fs->pindex >= fs->object->size) {
		unlock_and_deallocate(fs);
		return (FAULT_OUT_OF_BOUNDS);
	}

	if (fs->object == fs->first_object &&
	    (fs->first_object->flags & OBJ_POPULATE) != 0 &&
	    fs->first_object->shadow_count == 0) {
		res = vm_fault_populate(fs);
		switch (res) {
		case FAULT_SUCCESS:
		case FAULT_FAILURE:
		case FAULT_RESTART:
			unlock_and_deallocate(fs);
			return (res);
		case FAULT_CONTINUE:
			/*
			 * Pager's populate() method
			 * returned VM_PAGER_BAD.
			 */
			break;
		default:
			panic("inconsistent return codes");
		}
	}

	/*
	 * Allocate a new page for this object/offset pair.
	 *
	 * If the process has a fatal signal pending, prioritize the allocation
	 * with the expectation that the process will exit shortly and free some
	 * pages.  In particular, the signal may have been posted by the page
	 * daemon in an attempt to resolve an out-of-memory condition.
	 *
	 * The unlocked read of the p_flag is harmless.  At worst, the P_KILLED
	 * might be not observed here, and allocation fails, causing a restart
	 * and new reading of the p_flag.
	 */
	dset = fs->object->domain.dr_policy;
	if (dset == NULL)
		dset = curthread->td_domain.dr_policy;
	if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) {
#if VM_NRESERVLEVEL > 0
		vm_object_color(fs->object, atop(fs->vaddr) - fs->pindex);
#endif
		fs->m = vm_page_alloc(fs->object, fs->pindex,
		    P_KILLED(curproc) ? VM_ALLOC_SYSTEM : 0);
	}
	if (fs->m == NULL) {
		if (vm_fault_allocate_oom(fs))
			vm_waitpfault(dset, vm_pfault_oom_wait * hz);
		return (FAULT_RESTART);
	}
	/* A successful allocation resets the OOM timer. */
	fs->oom_started = false;

	return (FAULT_CONTINUE);
}

/*
 * Call the pager to retrieve the page if there is a chance
 * that the pager has it, and potentially retrieve additional
 * pages at the same time.
 *
 * *behindp and *aheadp are set to the requested read-behind and read-ahead
 * page counts and then passed to vm_pager_get_pages().
 *
 * Returns FAULT_HARD when the pager supplied the page, FAULT_CONTINUE when
 * the pager returned VM_PAGER_FAIL, FAULT_RESTART when locking the vnode
 * required a restart, and FAULT_OUT_OF_BOUNDS (after freeing the page and
 * tearing down the fault state) on VM_PAGER_ERROR or VM_PAGER_BAD.
 */
static enum fault_status
vm_fault_getpages(struct faultstate *fs, int *behindp, int *aheadp)
{
	vm_offset_t e_end, e_start;
	int ahead, behind, cluster_offset, rv;
	enum fault_status status;
	u_char behavior;

	/*
	 * Prepare for unlocking the map.  Save the map
	 * entry's start and end addresses, which are used to
	 * optimize the size of the pager operation below.
	 * Even if the map entry's addresses change after
	 * unlocking the map, using the saved addresses is
	 * safe.
	 */
	e_start	= fs->entry->start;
	e_end = fs->entry->end;
	behavior = vm_map_entry_behavior(fs->entry);

	/*
	 * If the pager for the current object might have
	 * the page, then determine the number of additional
	 * pages to read and potentially reprioritize
	 * previously read pages for earlier reclamation.
	 * These operations should only be performed once per
	 * page fault.  Even if the current pager doesn't
	 * have the page, the number of additional pages to
	 * read will apply to subsequent objects in the
	 * shadow chain.
	 */
	if (fs->nera == -1 && !P_KILLED(curproc))
		fs->nera = vm_fault_readahead(fs);

	/*
	 * Release the map lock before locking the vnode or
	 * sleeping in the pager.  (If the current object has
	 * a shadow, then an earlier iteration of this loop
	 * may have already unlocked the map.)
	 */
	unlock_map(fs);

	status = vm_fault_lock_vnode(fs, false);
	MPASS(status == FAULT_CONTINUE || status == FAULT_RESTART);
	if (status == FAULT_RESTART)
		return (status);
	KASSERT(fs->vp == NULL || !fs->map->system_map,
	    ("vm_fault: vnode-backed object mapped by system map"));

	/*
	 * Page in the requested page and hint the pager,
	 * that it may bring up surrounding pages.
	 */
	if (fs->nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
	    P_KILLED(curproc)) {
		behind = 0;
		ahead = 0;
	} else {
		/* Is this a sequential fault? */
		if (fs->nera > 0) {
			behind = 0;
			ahead = fs->nera;
		} else {
			/*
			 * Request a cluster of pages that is
			 * aligned to a VM_FAULT_READ_DEFAULT
			 * page offset boundary within the
			 * object.  Alignment to a page offset
			 * boundary is more likely to coincide
			 * with the underlying file system
			 * block than alignment to a virtual
			 * address boundary.
			 */
			cluster_offset = fs->pindex % VM_FAULT_READ_DEFAULT;
			behind = ulmin(cluster_offset,
			    atop(fs->vaddr - e_start));
			ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset;
		}
		/* Never read ahead past the end of the map entry. */
		ahead = ulmin(ahead, atop(e_end - fs->vaddr) - 1);
	}
	*behindp = behind;
	*aheadp = ahead;
	rv = vm_pager_get_pages(fs->object, &fs->m, 1, behindp, aheadp);
	if (rv == VM_PAGER_OK)
		return (FAULT_HARD);
	if (rv == VM_PAGER_ERROR)
		printf("vm_fault: pager read error, pid %d (%s)\n",
		    curproc->p_pid, curproc->p_comm);
	/*
	 * If an I/O error occurred or the requested page was
	 * outside the range of the pager, clean up and return
	 * an error.
	 */
	if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) {
		VM_OBJECT_WLOCK(fs->object);
		fault_page_free(&fs->m);
		unlock_and_deallocate(fs);
		return (FAULT_OUT_OF_BOUNDS);
	}
	KASSERT(rv == VM_PAGER_FAIL,
	    ("%s: unexpected pager error %d", __func__, rv));
	return (FAULT_CONTINUE);
}

/*
 * Wait/Retry if the page is busy.  We have to do this if the page is
 * either exclusive or shared busy because the vm_pager may be using
 * read busy for pageouts (and even pageins if it is the vnode pager),
 * and we could end up trying to pagein and pageout the same page
 * simultaneously.
 *
 * We can theoretically allow the busy case on a read fault if the page
 * is marked valid, but since such pages are typically already pmap'd,
 * putting that special case in might be more effort then it is worth.
 * We cannot under any circumstances mess around with a shared busied
 * page except, perhaps, to pmap it.
 */
static void
vm_fault_busy_sleep(struct faultstate *fs)
{
	/*
	 * Reference the page before unlocking and
	 * sleeping so that the page daemon is less
	 * likely to reclaim it.
	 */
	vm_page_aflag_set(fs->m, PGA_REFERENCED);
	if (fs->object != fs->first_object) {
		fault_page_release(&fs->first_m);
		vm_object_pip_wakeup(fs->first_object);
	}
	vm_object_pip_wakeup(fs->object);
	unlock_map(fs);
	/*
	 * The object is unlocked here only when the page was replaced at
	 * this index or vm_page_busy_sleep() returned false.
	 * NOTE(review): presumably vm_page_busy_sleep() drops the object
	 * lock itself when it returns true; confirm against its definition.
	 */
	if (fs->m != vm_page_lookup(fs->object, fs->pindex) ||
	    !vm_page_busy_sleep(fs->m, "vmpfw", 0))
		VM_OBJECT_WUNLOCK(fs->object);
	VM_CNT_INC(v_intrans);
	vm_object_deallocate(fs->first_object);
}

/*
 * Handle page lookup, populate, allocate, page-in for the current
 * object.
 *
 * The object is locked on entry and will remain locked with a return
 * code of FAULT_CONTINUE so that fault may follow the shadow chain.
 * Otherwise, the object will be unlocked upon return.
 */
static enum fault_status
vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
{
	enum fault_status res;
	bool dead;

	/*
	 * If the object is marked for imminent termination, we retry
	 * here, since the collapse pass has raced with us.
Otherwise, * if we see terminally dead object, return fail. */ if ((fs->object->flags & OBJ_DEAD) != 0) { dead = fs->object->type == OBJT_DEAD; unlock_and_deallocate(fs); if (dead) return (FAULT_PROTECTION_FAILURE); pause("vmf_de", 1); return (FAULT_RESTART); } /* * See if the page is resident. */ fs->m = vm_page_lookup(fs->object, fs->pindex); if (fs->m != NULL) { if (!vm_page_tryxbusy(fs->m)) { vm_fault_busy_sleep(fs); return (FAULT_RESTART); } /* * The page is marked busy for other processes and the * pagedaemon. If it is still completely valid we are * done. */ if (vm_page_all_valid(fs->m)) { VM_OBJECT_WUNLOCK(fs->object); return (FAULT_SOFT); } } VM_OBJECT_ASSERT_WLOCKED(fs->object); /* * Page is not resident. If the pager might contain the page * or this is the beginning of the search, allocate a new * page. */ if (fs->m == NULL && (fault_object_needs_getpages(fs->object) || fs->object == fs->first_object)) { res = vm_fault_allocate(fs); if (res != FAULT_CONTINUE) return (res); } /* * Default objects have no pager so no exclusive busy exists * to protect this page in the chain. Skip to the next * object without dropping the lock to preserve atomicity of * shadow faults. */ if (fault_object_needs_getpages(fs->object)) { /* * At this point, we have either allocated a new page * or found an existing page that is only partially * valid. * * We hold a reference on the current object and the * page is exclusive busied. The exclusive busy * prevents simultaneous faults and collapses while * the object lock is dropped. 
*/ VM_OBJECT_WUNLOCK(fs->object); res = vm_fault_getpages(fs, behindp, aheadp); if (res == FAULT_CONTINUE) VM_OBJECT_WLOCK(fs->object); } else { res = FAULT_CONTINUE; } return (res); } int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { struct faultstate fs; int ahead, behind, faultcount, rv; enum fault_status res; bool hardfault; VM_CNT_INC(v_vm_faults); if ((curthread->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); fs.vp = NULL; fs.vaddr = vaddr; fs.m_hold = m_hold; fs.fault_flags = fault_flags; fs.map = map; fs.lookup_still_valid = false; fs.oom_started = false; fs.nera = -1; faultcount = 0; hardfault = false; RetryFault: fs.fault_type = fault_type; /* * Find the backing store object and offset into it to begin the * search. */ rv = vm_fault_lookup(&fs); if (rv != KERN_SUCCESS) { if (rv == KERN_RESOURCE_SHORTAGE) goto RetryFault; return (rv); } /* * Try to avoid lock contention on the top-level object through * special-case handling of some types of page faults, specifically, * those that are mapping an existing page from the top-level object. * Under this condition, a read lock on the object suffices, allowing * multiple page faults of a similar type to run in parallel. */ if (fs.vp == NULL /* avoid locked vnode leak */ && (fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 && (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { VM_OBJECT_RLOCK(fs.first_object); res = vm_fault_soft_fast(&fs); if (res == FAULT_SUCCESS) return (KERN_SUCCESS); if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); } } else { VM_OBJECT_WLOCK(fs.first_object); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. 
* * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.m_cow = fs.m = fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) { res = vm_fault_allocate(&fs); switch (res) { case FAULT_RESTART: goto RetryFault; case FAULT_SUCCESS: return (KERN_SUCCESS); case FAULT_FAILURE: return (KERN_FAILURE); case FAULT_OUT_OF_BOUNDS: return (KERN_OUT_OF_BOUNDS); case FAULT_CONTINUE: break; default: panic("vm_fault: Unhandled status %d", res); } } while (TRUE) { KASSERT(fs.m == NULL, ("page still set %p at loop start", fs.m)); res = vm_fault_object(&fs, &behind, &ahead); switch (res) { case FAULT_SOFT: goto found; case FAULT_HARD: faultcount = behind + 1 + ahead; hardfault = true; goto found; case FAULT_RESTART: goto RetryFault; case FAULT_SUCCESS: return (KERN_SUCCESS); case FAULT_FAILURE: return (KERN_FAILURE); case FAULT_OUT_OF_BOUNDS: return (KERN_OUT_OF_BOUNDS); case FAULT_PROTECTION_FAILURE: return (KERN_PROTECTION_FAILURE); case FAULT_CONTINUE: break; default: panic("vm_fault: Unhandled status %d", res); } /* * The page was not found in the current object. Try to * traverse into a backing object or zero fill if none is * found. */ if (vm_fault_next(&fs)) continue; if ((fs.fault_flags & VM_FAULT_NOFILL) != 0) { if (fs.first_object == fs.object) fault_page_free(&fs.first_m); unlock_and_deallocate(&fs); return (KERN_OUT_OF_BOUNDS); } VM_OBJECT_WUNLOCK(fs.object); vm_fault_zerofill(&fs); /* Don't try to prefault neighboring pages. */ faultcount = 1; break; } found: /* * A valid page has been found and exclusively busied. The * object lock must no longer be held. 
*/ vm_page_assert_xbusied(fs.m); VM_OBJECT_ASSERT_UNLOCKED(fs.object); /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { vm_fault_cow(&fs); /* * We only try to prefault read-only mappings to the * neighboring pages when this copy-on-write fault is * a hard fault. In other cases, trying to prefault * is typically wasted effort. */ if (faultcount == 0) faultcount = 1; } else { fs.prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { rv = vm_fault_relookup(&fs); if (rv != KERN_SUCCESS) { fault_deallocate(&fs); if (rv == KERN_RESTART) goto RetryFault; return (rv); } } VM_OBJECT_ASSERT_UNLOCKED(fs.object); /* * If the page was filled by a pager, save the virtual address that * should be faulted on next under a sequential access pattern to the * map entry. A read lock on the map suffices to update this address * safely. */ if (hardfault) fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ vm_page_assert_xbusied(fs.m); KASSERT(vm_page_all_valid(fs.m), ("vm_fault: page %p partially invalid", fs.m)); vm_fault_dirty(&fs, fs.m); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 && fs.wired == 0) vm_fault_prefault(&fs, vaddr, faultcount > 0 ? behind : PFBAK, faultcount > 0 ? 
ahead : PFFOR, false); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if ((fs.fault_flags & VM_FAULT_WIRE) != 0) vm_page_wire(fs.m); else vm_page_activate(fs.m); if (fs.m_hold != NULL) { (*fs.m_hold) = fs.m; vm_page_wire(fs.m); } vm_page_xunbusy(fs.m); fs.m = NULL; /* * Unlock everything, and return */ fault_deallocate(&fs); if (hardfault) { VM_CNT_INC(v_io_faults); curthread->td_ru.ru_majflt++; #ifdef RACCT if (racct_enable && fs.object->type == OBJT_VNODE) { PROC_LOCK(curproc); if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { racct_add_force(curproc, RACCT_WRITEBPS, PAGE_SIZE + behind * PAGE_SIZE); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } else { racct_add_force(curproc, RACCT_READBPS, PAGE_SIZE + ahead * PAGE_SIZE); racct_add_force(curproc, RACCT_READIOPS, 1); } PROC_UNLOCK(curproc); } #endif } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of pages that precede the faulting pindex within * the first object of the shadow chain. Essentially, perform the equivalent * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes * the faulting pindex by the cluster size when the pages read by vm_fault() * cross a cluster-size boundary. The cluster size is the greater of the * smallest superpage size and VM_FAULT_DONTNEED_MIN. * * When "fs->first_object" is a shadow object, the pages in the backing object * that precede the faulting pindex are deactivated by vm_fault(). So, this * function must only be concerned with pages in the first object. */ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { vm_map_entry_t entry; vm_object_t first_object; vm_offset_t end, start; vm_page_t m, m_next; vm_pindex_t pend, pstart; vm_size_t size; VM_OBJECT_ASSERT_UNLOCKED(fs->object); first_object = fs->first_object; /* Neither fictitious nor unmanaged pages can be reclaimed. 
*/ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { VM_OBJECT_RLOCK(first_object); size = VM_FAULT_DONTNEED_MIN; if (MAXPAGESIZES > 1 && size < pagesizes[1]) size = pagesizes[1]; end = rounddown2(vaddr, size); if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && (entry = fs->entry)->start < end) { if (end - entry->start < size) start = entry->start; else start = end - size; pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); pstart = OFF_TO_IDX(entry->offset) + atop(start - entry->start); m_next = vm_page_find_least(first_object, pstart); pend = OFF_TO_IDX(entry->offset) + atop(end - entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); if (!vm_page_all_valid(m) || vm_page_busied(m)) continue; /* * Don't clear PGA_REFERENCED, since it would * likely represent a reference by a different * process. * * Typically, at this point, prefetched pages * are still in the inactive queue. Only * pages that triggered page faults are in the * active queue. The test for whether the page * is in the inactive queue is racy; in the * worst case we will requeue the page * unnecessarily. */ if (!vm_page_inactive(m)) vm_page_deactivate(m); } } VM_OBJECT_RUNLOCK(first_object); } } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. 
*/ /* * "addra" is the faulting address; up to "backward" pages before it and * "forward" pages after it, clipped to the map entry, are candidates. * When "obj_locked" is true the entry's top-level object is neither * locked nor unlocked here (the caller presumably already holds its * lock); backing objects are always read-locked and unlocked locally. * Nothing is done unless the fault's pmap is the current process's pmap. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; entry = fs->entry; if (addra < backward * PAGE_SIZE) { starta = entry->start; } else { starta = addra - backward * PAGE_SIZE; if (starta < entry->start) starta = entry->start; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ?
-PAGE_SIZE : PAGE_SIZE); /* Beyond the forward window: use 0 so that the range check below skips it (assumes starta > 0). */ if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; if (!obj_locked) VM_OBJECT_RLOCK(lobject); /* Walk down the shadow chain, locking the backing object before releasing the current one. */ while ((m = vm_page_lookup(lobject, pindex)) == NULL && !fault_object_needs_getpages(lobject) && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); break; } if (vm_page_all_valid(m) && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned.
*/ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); if (!vm_map_range_valid(map, addr, end)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. * * If vm_fault_disable_pagefaults() was called, * i.e., TDP_NOFAULTING is set, we must not sleep nor * acquire MD VM locks, which means we must not call * vm_fault(). Some (out of tree) callers mark * too wide a code area with vm_fault_disable_pagefaults() * already, use the VM_PROT_QUICK_NOFAULT flag to request * the proper behaviour explicitly.
*/ if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && (curthread->td_pflags & TDP_NOFAULTING) != 0) goto error; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: /* Release every page that was held before the failure; slots that were never filled are still NULL. */ for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) vm_page_unwire(*mp, PQ_INACTIVE); return (-1); } /* * Routine: * vm_fault_copy_entry * Function: * Create new object backing dst_entry with private copy of all * underlying pages. When src_entry is equal to dst_entry, function * implements COW for wired-down map entry. Otherwise, it forks * wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map __unused, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; bool upgrade; upgrade = src_entry == dst_entry; KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults.
*/ access = prot = dst_entry->protection; if (!upgrade) access &= ~VM_PROT_WRITE; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. * Doesn't actually shadow anything - we copy the pages * directly. */ dst_object = vm_object_allocate_anon(atop(dst_entry->end - dst_entry->start), NULL, NULL, 0); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif dst_object->domain = src_object->domain; dst_object->charge = dst_entry->end - dst_entry->start; dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC; } VM_OBJECT_WLOCK(dst_object); if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; - } else if ((dst_object->type == OBJT_DEFAULT || - (dst_object->flags & OBJ_SWAP) != 0) && + } else if ((dst_object->flags & OBJ_SWAP) != 0 && dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object doesn't share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in.
* Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); vm_wait(dst_object); VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); /* * The object lock does not guarantee that "src_m" will * transition from invalid to valid, but it does ensure * that "src_m" will not transition from valid to * invalid. */ dst_m->dirty = dst_m->valid = src_m->valid; VM_OBJECT_RUNLOCK(object); } else { dst_m = src_m; if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0) goto again; if (dst_m->pindex >= dst_object->size) { /* * We are upgrading. Index can occur * out of bounds if the object type is * vnode and the file was truncated. */ vm_page_xunbusy(dst_m); break; } } /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping.
* * The page can be invalid if the user called * msync(MS_INVALIDATE) or truncated the backing vnode * or shared memory object. In this case, do not * insert it into pmap, but still do the copy so that * all copies of the wired map entry have similar * backing pages. */ if (vm_page_all_valid(dst_m)) { VM_OBJECT_WUNLOCK(dst_object); pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); VM_OBJECT_WLOCK(dst_object); } /* * Mark it no longer busy, and put it on the active list. */ if (upgrade) { if (src_m != dst_m) { vm_page_unwire(src_m, PQ_INACTIVE); vm_page_wire(dst_m); } else { KASSERT(vm_page_wired(dst_m), ("dst_m %p is not wired", dst_m)); } } else { vm_page_activate(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } /* * Restore the calling thread's pflags from "save" using * curthread_pflags_restore(). "save" is presumably the value returned * by the matching vm_fault_disable_pagefaults() call. */ void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); } diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 7e528fae7453..04310e42218f 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -1,5373 +1,5365 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a self-adjusting binary search tree of these * entries is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. 
*/ /* File-scope state: the map sleep mutex and the UMA zones for map entries (user and kernel) and vmspaces, all set up in vm_map_startup(). */ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t vmspace_zone; static int vmspace_zinit(void *mem, int size, int flags); static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max); static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry); static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg); #endif static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow); static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr); /* * ENTRY_CHARGED: true if the entry itself holds a credential, or if its * object holds one and the entry is not marked MAP_ENTRY_NEEDS_COPY. */ #define ENTRY_CHARGED(e) ((e)->cred != NULL || \ ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \ !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Clamps the starting and ending region addresses to the valid * range of the map; if the start then lies above the end, the * start is set to the end. Despite the name nothing is asserted: * the "start" and "end" arguments are modified in place, so they * must be lvalues, and each is evaluated more than once. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } #ifndef UMA_MD_SMALL_ALLOC /* * Allocate a new slab for kernel map entries. The kernel map may be locked or * unlocked, depending on whether the request is coming from the kernel map or a * submap.
This function allocates a virtual address range directly from the * kernel map instead of the kmem_* layer to avoid recursion on the kernel map * lock and also to avoid triggering allocator recursion in the vmem boundary * tag allocator. */ static void * kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) { vm_offset_t addr; int error, locked; *pflag = UMA_SLAB_PRIV; if (!(locked = vm_map_locked(kernel_map))) vm_map_lock(kernel_map); addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes); if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map)) panic("%s: kernel map is exhausted", __func__); error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes, VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); if (error != KERN_SUCCESS) panic("%s: vm_map_insert() failed: %d", __func__, error); if (!locked) vm_map_unlock(kernel_map); error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT | M_USE_RESERVE | (wait & M_ZERO)); if (error == KERN_SUCCESS) { return ((void *)addr); } else { if (!locked) vm_map_lock(kernel_map); vm_map_delete(kernel_map, addr, bytes); if (!locked) vm_map_unlock(kernel_map); return (NULL); } } static void kmapent_free(void *item, vm_size_t size, uint8_t pflag) { vm_offset_t addr; int error __diagused; if ((pflag & UMA_SLAB_PRIV) == 0) /* XXX leaked */ return; addr = (vm_offset_t)item; kmem_unback(kernel_object, addr, size); error = vm_map_remove(kernel_map, addr, addr + size); KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove failed: %d", __func__, error)); } /* * The worst-case upper bound on the number of kernel map entries that may be * created before the zone must be replenished in _vm_map_unlock(). */ #define KMAPENT_RESERVE 1 #endif /* !UMD_MD_SMALL_ALLOC */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before any other vm_map * routines. * * User map and entry structures are allocated from the general purpose * memory pool. Kernel maps are statically defined. 
Kernel map entries
 *	require special handling to avoid recursion; see the comments above
 *	kmapent_alloc() and in vm_map_entry_create().
 */
void
vm_map_startup(void)
{
	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);

	/*
	 * Disable the use of per-CPU buckets: map entry allocation is
	 * serialized by the kernel map lock.
	 */
	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
#ifndef UMA_MD_SMALL_ALLOC
	/* Reserve an extra map entry for use when replenishing the reserve. */
	uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
	uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
	/* Route slab allocation and release through the private hooks. */
	uma_zone_set_allocf(kmapentzone, kmapent_alloc);
	uma_zone_set_freef(kmapentzone, kmapent_free);
#endif

	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * vmspace_zinit() is the zone's init hook; vmspace_zdtor() is the
	 * destructor and is only installed under INVARIANTS.
	 * NOTE(review): UMA_ZONE_NOFREE is presumably what keeps vmspaces
	 * type-stable -- confirm against uma(9).
	 */
	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
#ifdef INVARIANTS
	    vmspace_zdtor,
#else
	    NULL,
#endif
	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}

/*
 * Init hook for vmspace_zone: zero the embedded vm_map and initialize its
 * system mutex, its sx lock and the pmap lock.  "size" and "flags" are
 * unused.  Always returns 0.
 */
static int
vmspace_zinit(void *mem, int size, int flags)
{
	struct vmspace *vm;
	vm_map_t map;

	vm = (struct vmspace *)mem;
	map = &vm->vm_map;

	memset(map, 0, sizeof(*map));
	mtx_init(&map->system_mtx, "vm map (system)", NULL,
	    MTX_DEF | MTX_DUPOK);
	sx_init(&map->lock, "vm map (user)");
	PMAP_LOCK_INIT(vmspace_pmap(vm));
	return (0);
}

#ifdef INVARIANTS
/*
 * Destructor for vmspace_zone (INVARIANTS kernels only): assert that the
 * map of a vmspace being freed has no entries and a size of zero.
 */
static void
vmspace_zdtor(void *mem, int size, void *arg)
{
	struct vmspace *vm;

	vm = (struct vmspace *)mem;
	KASSERT(vm->vm_map.nentries == 0,
	    ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
	KASSERT(vm->vm_map.size == 0,
	    ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
}
#endif	/* INVARIANTS */

/*
 * Allocate a vmspace structure, including a vm_map and pmap,
 * and initialize those structures.  The refcnt is set to 1.
*/
/*
 * "min" and "max" become the bounds of the new map; "pinit" initializes
 * the pmap.  Returns NULL, after handing the vmspace back to its zone, if
 * pinit() fails.
 */
struct vmspace *
vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
{
	struct vmspace *vm;

	vm = uma_zalloc(vmspace_zone, M_WAITOK);
	/* vmspace_dofree() clears the pmap pointer before freeing. */
	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
	if (!pinit(vmspace_pmap(vm))) {
		uma_zfree(vmspace_zone, vm);
		return (NULL);
	}
	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
	refcount_init(&vm->vm_refcnt, 1);
	vm->vm_shm = NULL;
	vm->vm_swrss = 0;
	vm->vm_tsize = 0;
	vm->vm_dsize = 0;
	vm->vm_ssize = 0;
	vm->vm_taddr = 0;
	vm->vm_daddr = 0;
	vm->vm_maxsaddr = 0;
	return (vm);
}

#ifdef RACCT
/*
 * Zero the RACCT data, stack, RSS, memlock and vmem counters of "p" while
 * holding the process lock.
 */
static void
vmspace_container_reset(struct proc *p)
{

	PROC_LOCK(p);
	racct_set(p, RACCT_DATA, 0);
	racct_set(p, RACCT_STACK, 0);
	racct_set(p, RACCT_RSS, 0);
	racct_set(p, RACCT_MEMLOCK, 0);
	racct_set(p, RACCT_VMEM, 0);
	PROC_UNLOCK(p);
}
#endif

/*
 * Tear down a vmspace: release SysV shm, remove every mapping, release
 * the pmap and return the structure to vmspace_zone.  Called by
 * vmspace_free() and vmspace_exit() once the last reference is dropped.
 */
static inline void
vmspace_dofree(struct vmspace *vm)
{

	CTR1(KTR_VM, "vmspace_free: %p", vm);

	/*
	 * Make sure any SysV shm is freed, it might not have been in
	 * exit1().
	 */
	shmexit(vm);

	/*
	 * Lock the map, to wait out all other references to it.
	 * Delete all of the mappings and pages they hold, then call
	 * the pmap module to reclaim anything left.
	 */
	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
	    vm_map_max(&vm->vm_map));

	pmap_release(vmspace_pmap(vm));
	/* vmspace_alloc() asserts that a recycled vmspace has no pmap. */
	vm->vm_map.pmap = NULL;
	uma_zfree(vmspace_zone, vm);
}

/*
 * Drop one reference on "vm"; the caller that drops the last reference
 * tears the vmspace down.
 */
void
vmspace_free(struct vmspace *vm)
{

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "vmspace_free() called");

	if (refcount_release(&vm->vm_refcnt))
		vmspace_dofree(vm);
}

/*
 * Detach "p" from its vmspace, which is asserted to be vmspace0, and drop
 * the reference the process held on it.
 */
void
vmspace_exitfree(struct proc *p)
{
	struct vmspace *vm;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	p->p_vmspace = NULL;
	PROC_VMSPACE_UNLOCK(p);
	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
	vmspace_free(vm);
}

/*
 * Release the reference that td's process holds on its vmspace, leaving
 * the process attached to vmspace0.  If this was the last reference the
 * vmspace is torn down here.
 */
void
vmspace_exit(struct thread *td)
{
	struct vmspace *vm;
	struct proc *p;
	bool released;

	p = td->td_proc;
	vm = p->p_vmspace;

	/*
	 * Prepare to release the vmspace reference.  The thread that releases
	 * the last reference is responsible for tearing down the vmspace.
	 * However, threads not releasing the final reference must switch to the
	 * kernel's vmspace0 before the decrement so that the subsequent pmap
	 * deactivation does not modify a freed vmspace.
	 */
	refcount_acquire(&vmspace0.vm_refcnt);
	if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
		if (p->p_vmspace != &vmspace0) {
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = &vmspace0;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
		/*
		 * Other references may have gone away in the meantime, so
		 * this release can still turn out to be the last one.
		 */
		released = refcount_release(&vm->vm_refcnt);
	}
	if (released) {
		/*
		 * pmap_remove_pages() expects the pmap to be active, so switch
		 * back first if necessary.
		 */
		if (p->p_vmspace != vm) {
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = vm;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
		pmap_remove_pages(vmspace_pmap(vm));
		PROC_VMSPACE_LOCK(p);
		p->p_vmspace = &vmspace0;
		PROC_VMSPACE_UNLOCK(p);
		pmap_activate(td);
		vmspace_dofree(vm);
	}
#ifdef RACCT
	if (racct_enable)
		vmspace_container_reset(p);
#endif
}

/* Acquire reference to vmspace owned by another process. */

struct vmspace *
vmspace_acquire_ref(struct proc *p)
{
	struct vmspace *vm;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	/* Fail if there is no vmspace or its reference count is already 0. */
	if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
		PROC_VMSPACE_UNLOCK(p);
		return (NULL);
	}
	/* The process changed vmspaces meanwhile: drop the stale reference. */
	if (vm != p->p_vmspace) {
		PROC_VMSPACE_UNLOCK(p);
		vmspace_free(vm);
		return (NULL);
	}
	PROC_VMSPACE_UNLOCK(p);
	return (vm);
}

/*
 * Switch between vmspaces in an AIO kernel process.
 *
 * The new vmspace is either the vmspace of a user process obtained
 * from an active AIO request or the initial vmspace of the AIO kernel
 * process (when it is idling).  Because user processes will block to
 * drain any active AIO requests before proceeding in exit() or
 * execve(), the reference count for vmspaces from AIO requests can
 * never be 0.  Similarly, AIO kernel processes hold an extra
 * reference on their initial vmspace for the life of the process.
As
 * a result, the 'newvm' vmspace always has a non-zero reference
 * count.  This permits an additional reference on 'newvm' to be
 * acquired via a simple atomic increment rather than the loop in
 * vmspace_acquire_ref() above.
 */
void
vmspace_switch_aio(struct vmspace *newvm)
{
	struct vmspace *oldvm;

	/* XXX: Need some way to assert that this is an aio daemon. */

	KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
	    ("vmspace_switch_aio: newvm unreferenced"));

	oldvm = curproc->p_vmspace;
	if (oldvm == newvm)
		return;

	/*
	 * Point to the new address space and refer to it.
	 */
	curproc->p_vmspace = newvm;
	refcount_acquire(&newvm->vm_refcnt);

	/* Activate the new mapping. */
	pmap_activate(curthread);

	/* Drop the reference that was held on the previous vmspace. */
	vmspace_free(oldvm);
}

/*
 * Acquire the exclusive lock on "map" (the mutex for system maps, the sx
 * lock otherwise) and advance the map's timestamp.  "file" and "line"
 * identify the call site for the lock primitives.
 */
void
_vm_map_lock(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_lock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_xlock_(&map->lock, file, line);
	map->timestamp++;
}

/*
 * For an entry flagged MAP_ENTRY_VN_EXEC, set ("add" true) or unset
 * ("add" false) the text state of the vnode that backs the entry's
 * object.  Entries without the flag are ignored.  For an OBJ_ANON object
 * the vnode is looked up through the object stored in its "handle".
 */
void
vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
{
	vm_object_t object;
	struct vnode *vp;
	bool vp_held;

	if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
		return;
	KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
	    ("Submap with execs"));
	object = entry->object.vm_object;
	KASSERT(object != NULL, ("No object for text, entry %p", entry));
	if ((object->flags & OBJ_ANON) != 0)
		object = object->handle;
	else
		KASSERT(object->backing_object == NULL,
		    ("non-anon object %p shadows", object));
	KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
	    entry, entry->object.vm_object));

	/*
	 * Mostly, we do not lock the backing object.  It is
	 * referenced by the entry we are processing, so it cannot go
	 * away.
	 */
	vm_pager_getvp(object, &vp, &vp_held);
	if (vp != NULL) {
		if (add) {
			VOP_SET_TEXT_CHECKED(vp);
		} else {
			/* The unset path takes a shared vnode lock itself. */
			vn_lock(vp, LK_SHARED | LK_RETRY);
			VOP_UNSET_TEXT_CHECKED(vp);
			VOP_UNLOCK(vp);
		}
		/* Release the hold that vm_pager_getvp() reported taking. */
		if (vp_held)
			vdrop(vp);
	}
}

/*
 * Use a different name for this vm_map_entry field when its use
 * is not consistent with its use as part of an ordered search tree.
*/
#define	defer_next right

/*
 * Dispose of the map entries queued on the current thread's
 * td_map_def_user list, which is linked through "defer_next".  For each
 * entry this releases any write count it holds, unsets vnode text state
 * where applicable, and deallocates the entry.
 */
static void
vm_map_process_deferred(void)
{
	struct thread *td;
	vm_map_entry_t entry, next;
	vm_object_t object;

	td = curthread;
	/* Detach the whole list first, then walk the private copy. */
	entry = td->td_map_def_user;
	td->td_map_def_user = NULL;
	while (entry != NULL) {
		next = entry->defer_next;
		/* An entry may not be both write-counted and executable. */
		MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
		    MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
		    MAP_ENTRY_VN_EXEC));
		if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
			/*
			 * Decrement the object's writemappings and
			 * possibly the vnode's v_writecount.
			 */
			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
			    ("Submap with writecount"));
			object = entry->object.vm_object;
			KASSERT(object != NULL, ("No object for writecount"));
			vm_pager_release_writecount(object, entry->start,
			    entry->end);
		}
		vm_map_entry_set_vnode_text(entry, false);
		vm_map_entry_deallocate(entry, FALSE);
		entry = next;
	}
}

#ifdef INVARIANTS
/*
 * Assert that the caller owns the map's mutex (system maps) or holds its
 * sx lock exclusively (all other maps).
 */
static void
_vm_map_assert_locked(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	else
		sx_assert_(&map->lock, SA_XLOCKED, file, line);
}

#define	VM_MAP_ASSERT_LOCKED(map) \
    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)

/* Levels for the debug.vmmap_check sysctl declared below. */
enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
#ifdef DIAGNOSTIC
static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
#else
static int enable_vmmap_check = VMMAP_CHECK_NONE;
#endif
SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
    &enable_vmmap_check, 0, "Enable vm map consistency checking");

static void _vm_map_assert_consistent(vm_map_t map, int check);

#define VM_MAP_ASSERT_CONSISTENT(map) \
    _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
#ifdef DIAGNOSTIC
/*
 * Run the unlock-time consistency check only once the number of updates
 * exceeds the number of entries, then reset the update counter.
 */
#define VM_MAP_UNLOCK_CONSISTENT(map) do {				\
	if (map->nupdates > map->nentries) {				\
		_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK);	\
		map->nupdates = 0;					\
	}								\
} while (0)
#else
#define VM_MAP_UNLOCK_CONSISTENT(map)
#endif
#else
/* Without INVARIANTS all three checks compile to nothing. */
#define	VM_MAP_ASSERT_LOCKED(map)
#define VM_MAP_ASSERT_CONSISTENT(map)
#define VM_MAP_UNLOCK_CONSISTENT(map)
#endif /*
INVARIANTS */ void _vm_map_unlock(vm_map_t map, const char *file, int line) { VM_MAP_UNLOCK_CONSISTENT(map); if (map->system_map) { #ifndef UMA_MD_SMALL_ALLOC if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) { uma_prealloc(kmapentzone, 1); map->flags &= ~MAP_REPLENISH; } #endif mtx_unlock_flags_(&map->system_mtx, 0, file, line); } else { sx_xunlock_(&map->lock, file, line); vm_map_process_deferred(); } } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_slock_(&map->lock, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) { KASSERT((map->flags & MAP_REPLENISH) == 0, ("%s: MAP_REPLENISH leaked", __func__)); mtx_unlock_flags_(&map->system_mtx, 0, file, line); } else { sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); } } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_xlock_(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_slock_(&map->lock, file, line); return (error == 0); } /* * _vm_map_lock_upgrade: [ internal use only ] * * Tries to upgrade a read (shared) lock on the specified map to a write * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a * non-zero value if the upgrade fails. If the upgrade fails, the map is * returned without a read or write lock held. * * Requires that the map be read locked. 
*/
int
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
{
	unsigned int last_timestamp;

	if (map->system_map) {
		/* The mutex is already exclusive; just check ownership. */
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else {
		if (!sx_try_upgrade_(&map->lock, file, line)) {
			/*
			 * The in-place upgrade failed: drop the shared lock
			 * and take the exclusive one from scratch.
			 */
			last_timestamp = map->timestamp;
			sx_sunlock_(&map->lock, file, line);
			vm_map_process_deferred();
			/*
			 * If the map's timestamp does not change while the
			 * map is unlocked, then the upgrade succeeds.
			 */
			sx_xlock_(&map->lock, file, line);
			if (last_timestamp != map->timestamp) {
				sx_xunlock_(&map->lock, file, line);
				return (1);
			}
		}
	}
	map->timestamp++;
	return (0);
}

/*
 * Downgrade an exclusive lock on "map" to a shared one.  For a system
 * map nothing is released: ownership of the mutex is only asserted.
 */
void
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
{

	if (map->system_map) {
		KASSERT((map->flags & MAP_REPLENISH) == 0,
		    ("%s: MAP_REPLENISH leaked", __func__));
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else {
		VM_MAP_UNLOCK_CONSISTENT(map);
		sx_downgrade_(&map->lock, file, line);
	}
}

/*
 *	vm_map_locked:
 *
 *	Returns a non-zero value if the caller holds a write (exclusive) lock
 *	on the specified map and the value "0" otherwise.
 */
int
vm_map_locked(vm_map_t map)
{

	if (map->system_map)
		return (mtx_owned(&map->system_mtx));
	else
		return (sx_xlocked(&map->lock));
}

/*
 *	_vm_map_unlock_and_wait:
 *
 *	Atomically releases the lock on the specified map and puts the calling
 *	thread to sleep.  The calling thread will remain asleep until either
 *	vm_map_wakeup() is performed on the map or the specified timeout is
 *	exceeded.
 *
 *	WARNING!  This function does not perform deferred deallocations of
 *	objects and map entries.  Therefore, the calling thread is expected to
 *	reacquire the map lock after reawakening and later perform an ordinary
 *	unlock operation, such as vm_map_unlock(), before completing its
 *	operation on the map.
*/
int
_vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
{

	VM_MAP_UNLOCK_CONSISTENT(map);
	/*
	 * Take map_sleep_mtx before dropping the map lock so that a
	 * vm_map_wakeup() cannot slip in between the unlock and the sleep.
	 */
	mtx_lock(&map_sleep_mtx);
	if (map->system_map) {
		KASSERT((map->flags & MAP_REPLENISH) == 0,
		    ("%s: MAP_REPLENISH leaked", __func__));
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	} else {
		sx_xunlock_(&map->lock, file, line);
	}
	/* PDROP: map_sleep_mtx is not reacquired when the sleep ends. */
	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
	    timo));
}

/*
 *	vm_map_wakeup:
 *
 *	Awaken any threads that have slept on the map using
 *	vm_map_unlock_and_wait().
 */
void
vm_map_wakeup(vm_map_t map)
{

	/*
	 * Acquire and release map_sleep_mtx to prevent a wakeup()
	 * from being performed (and lost) between the map unlock
	 * and the msleep() in _vm_map_unlock_and_wait().
	 */
	mtx_lock(&map_sleep_mtx);
	mtx_unlock(&map_sleep_mtx);
	wakeup(&map->root);
}

/*
 * Increment the map's busy count.  The map must be locked.
 */
void
vm_map_busy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	map->busy++;
}

/*
 * Decrement the map's busy count.  When it reaches zero and a waiter has
 * set MAP_BUSY_WAKEUP, update the flag through vm_map_modflags() and wake
 * the threads sleeping in vm_map_wait_busy().  The map must be locked.
 */
void
vm_map_unbusy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
		/* NOTE(review): arguments presumed to be (set, clear). */
		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
		wakeup(&map->busy);
	}
}

/*
 * Sleep until the map's busy count drops to zero, then advance the map
 * timestamp.  The map must be locked; the sleep is performed on the
 * map's own lock.
 */
void 
vm_map_wait_busy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	while (map->busy) {
		/* Ask vm_map_unbusy() for a wakeup before each sleep. */
		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
		if (map->system_map)
			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
		else
			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
	}
	map->timestamp++;
}

/*
 * Return pmap_resident_count() for the pmap of the given vmspace.
 */
long
vmspace_resident_count(struct vmspace *vmspace)
{
	return pmap_resident_count(vmspace_pmap(vmspace));
}

/*
 * Initialize an existing vm_map structure
 * such as that in the vmspace structure.
*/
/*
 * The locks are not touched here; vm_map_init() and vmspace_zinit() set
 * them up.  The header entry stores the bounds crosswise: header.end
 * holds "min" and header.start holds "max", so that the header can stand
 * in as the predecessor of the first entry and the successor of the last
 * one in the gap computations of vm_map_entry_max_free_{left,right}().
 */
static void
_vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{

	map->header.eflags = MAP_ENTRY_HEADER;
	map->needs_wakeup = FALSE;
	map->system_map = 0;
	map->pmap = pmap;
	map->header.end = min;
	map->header.start = max;
	map->flags = 0;
	/* An empty threaded tree: the header points at itself. */
	map->header.left = map->header.right = &map->header;
	map->root = NULL;
	map->timestamp = 0;
	map->busy = 0;
	map->anon_loc = 0;
#ifdef DIAGNOSTIC
	map->nupdates = 0;
#endif
}

/*
 * Initialize a map that is not embedded in a zone-allocated vmspace: set
 * up the map fields and both of its locks.
 */
void
vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{

	_vm_map_init(map, pmap, min, max);
	mtx_init(&map->system_mtx, "vm map (system)", NULL,
	    MTX_DEF | MTX_DUPOK);
	sx_init(&map->lock, "vm map (user)");
}

/*
 *	vm_map_entry_dispose:	[ internal use only ]
 *
 *	Inverse of vm_map_entry_create.
 */
static void
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
{
	/* Entries of system maps come from kmapentzone. */
	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
}

/*
 *	vm_map_entry_create:	[ internal use only ]
 *
 *	Allocates a VM map entry for insertion.
 *	No entry fields are filled in.
 */
static vm_map_entry_t
vm_map_entry_create(vm_map_t map)
{
	vm_map_entry_t new_entry;

#ifndef UMA_MD_SMALL_ALLOC
	if (map == kernel_map) {
		VM_MAP_ASSERT_LOCKED(map);

		/*
		 * A new slab of kernel map entries cannot be allocated at this
		 * point because the kernel map has not yet been updated to
		 * reflect the caller's request.  Therefore, we allocate a new
		 * map entry, dipping into the reserve if necessary, and set a
		 * flag indicating that the reserve must be replenished before
		 * the map is unlocked.
		 */
		new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
		if (new_entry == NULL) {
			new_entry = uma_zalloc(kmapentzone,
			    M_NOWAIT | M_NOVM | M_USE_RESERVE);
			kernel_map->flags |= MAP_REPLENISH;
		}
	} else
#endif
	if (map->system_map) {
		/* Other system maps: never sleep for an entry. */
		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
	} else {
		new_entry = uma_zalloc(mapentzone, M_WAITOK);
	}
	KASSERT(new_entry != NULL,
	    ("vm_map_entry_create: kernel resources exhausted"));
	return (new_entry);
}

/*
 *	vm_map_entry_set_behavior:
 *
 *	Set the expected access behavior, either normal, random, or
 *	sequential.
 */
static inline void
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
{
	/* Replace only the behavior bits; other flags are preserved. */
	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
	    (behavior & MAP_ENTRY_BEHAV_MASK);
}

/*
 *	vm_map_entry_max_free_{left,right}:
 *
 *	Compute the size of the largest free gap between two entries,
 *	one the root of a tree and the other the ancestor of that root
 *	that is the least or greatest ancestor found on the search path.
 */
static inline vm_size_t
vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
{

	/*
	 * With a real left child its cached max_free is the answer;
	 * otherwise the left pointer is a thread to the ancestor and the
	 * gap is measured directly.
	 */
	return (root->left != left_ancestor ?
	    root->left->max_free : root->start - left_ancestor->end);
}

static inline vm_size_t
vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
{

	/* Mirror image of vm_map_entry_max_free_left(). */
	return (root->right != right_ancestor ?
	    root->right->max_free : right_ancestor->start - root->end);
}

/*
 *	vm_map_entry_{pred,succ}:
 *
 *	Find the {predecessor, successor} of the entry by taking one step
 *	in the appropriate direction and backtracking as much as necessary.
 *	vm_map_entry_succ is defined in vm_map.h.
 */
static inline vm_map_entry_t
vm_map_entry_pred(vm_map_entry_t entry)
{
	vm_map_entry_t prior;

	prior = entry->left;
	/*
	 * If "prior" is a real left child, descend along right pointers
	 * until reaching the node whose right thread leads back to "entry".
	 */
	if (prior->right->start < entry->start) {
		do
			prior = prior->right;
		while (prior->right != entry);
	}
	return (prior);
}

/* Return the larger of two vm_size_t values. */
static inline vm_size_t
vm_size_max(vm_size_t a, vm_size_t b)
{

	return (a > b ? a : b);
}

/*
 * SPLAY_LEFT_STEP: one step of the top-down split towards the left.
 * "root" is pushed onto "rlist" (linked through its left pointer) and is
 * replaced by its left child, or by NULL when it has none.  If "test"
 * holds for the left child "y", "y" is first rotated above "root".
 * "root", "y" and "rlist" are assigned to, so they must be lvalues.
 */
#define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do {		\
	vm_map_entry_t z;						\
	vm_size_t max_free;						\
									\
	/*								\
	 * Infer root->right->max_free == root->max_free when		\
	 * y->max_free < root->max_free || root->max_free == 0.		\
	 * Otherwise, look right to find it.				\
	 */								\
	y = root->left;							\
	max_free = root->max_free;					\
	KASSERT(max_free == vm_size_max(				\
	    vm_map_entry_max_free_left(root, llist),			\
	    vm_map_entry_max_free_right(root, rlist)),			\
	    ("%s: max_free invariant fails", __func__));		\
	if (max_free - 1 < vm_map_entry_max_free_left(root, llist))	\
		max_free = vm_map_entry_max_free_right(root, rlist);	\
	if (y != llist && (test)) {					\
		/* Rotate right and make y root. */			\
		z = y->right;						\
		if (z != root) {					\
			root->left = z;					\
			y->right = root;				\
			if (max_free < y->max_free)			\
			    root->max_free = max_free =			\
			    vm_size_max(max_free, z->max_free);		\
		} else if (max_free < y->max_free)			\
			root->max_free = max_free =			\
			    vm_size_max(max_free, root->start - y->end);\
		root = y;						\
		y = root->left;						\
	}								\
	/* Copy right->max_free.  Put root on rlist. */			\
	root->max_free = max_free;					\
	KASSERT(max_free == vm_map_entry_max_free_right(root, rlist),	\
	    ("%s: max_free not copied from right", __func__));		\
	root->left = rlist;						\
	rlist = root;							\
	root = y != llist ? y : NULL;					\
} while (0)

/*
 * SPLAY_RIGHT_STEP: mirror image of SPLAY_LEFT_STEP.  "root" is pushed
 * onto "llist" (linked through its right pointer) and is replaced by its
 * right child, or by NULL when it has none.
 */
#define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do {		\
	vm_map_entry_t z;						\
	vm_size_t max_free;						\
									\
	/*								\
	 * Infer root->left->max_free == root->max_free when		\
	 * y->max_free < root->max_free || root->max_free == 0.		\
	 * Otherwise, look left to find it.				\
	 */								\
	y = root->right;						\
	max_free = root->max_free;					\
	KASSERT(max_free == vm_size_max(				\
	    vm_map_entry_max_free_left(root, llist),			\
	    vm_map_entry_max_free_right(root, rlist)),			\
	    ("%s: max_free invariant fails", __func__));		\
	if (max_free - 1 < vm_map_entry_max_free_right(root, rlist))	\
		max_free = vm_map_entry_max_free_left(root, llist);	\
	if (y != rlist && (test)) {					\
		/* Rotate left and make y root. */			\
		z = y->left;						\
		if (z != root) {					\
			root->right = z;				\
			y->left = root;					\
			if (max_free < y->max_free)			\
			    root->max_free = max_free =			\
			    vm_size_max(max_free, z->max_free);		\
		} else if (max_free < y->max_free)			\
			root->max_free = max_free =			\
			    vm_size_max(max_free, y->start - root->end);\
		root = y;						\
		y = root->right;					\
	}								\
	/* Copy left->max_free.  Put root on llist. */			\
	root->max_free = max_free;					\
	KASSERT(max_free == vm_map_entry_max_free_left(root, llist),	\
	    ("%s: max_free not copied from left", __func__));		\
	root->right = llist;						\
	llist = root;							\
	root = y != rlist ? y : NULL;					\
} while (0)

/*
 * Walk down the tree until we find addr or a gap where addr would go, breaking
 * off left and right subtrees of nodes less than, or greater than addr.  Treat
 * subtrees with root->max_free < length as empty trees.  llist and rlist are
 * the two sides in reverse order (bottom-up), with llist linked by the right
 * pointer and rlist linked by the left pointer in the vm_map_entry, and both
 * lists terminated by &map->header.  This function, and the subsequent call to
 * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
 * values in &map->header.
*/
/*
 * Returns the entry containing "addr", or NULL if the walk ran off the
 * tree or into a subtree whose max_free is below "length".  The two
 * spines are handed back through "llist" and "rlist".
 */
static __always_inline vm_map_entry_t
vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
    vm_map_entry_t *llist, vm_map_entry_t *rlist)
{
	vm_map_entry_t left, right, root, y;

	left = right = &map->header;
	root = map->root;
	while (root != NULL && root->max_free >= length) {
		KASSERT(left->end <= root->start &&
		    root->end <= right->start,
		    ("%s: root not within tree bounds", __func__));
		if (addr < root->start) {
			SPLAY_LEFT_STEP(root, y, left, right,
			    y->max_free >= length && addr < y->start);
		} else if (addr >= root->end) {
			SPLAY_RIGHT_STEP(root, y, left, right,
			    y->max_free >= length && addr >= y->end);
		} else
			break;
	}
	*llist = left;
	*rlist = right;
	return (root);
}

/*
 * Break up the right subtree of "root", if it has one, pushing its nodes
 * onto "*rlist" so that the head of the list becomes the successor of
 * "root".  "*rlist" is left unchanged when "root" has no right child.
 */
static __always_inline void
vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
{
	vm_map_entry_t hi, right, y;

	right = *rlist;
	/* A right pointer equal to the list head is a thread, not a child. */
	hi = root->right == right ? NULL : root->right;
	if (hi == NULL)
		return;
	do
		SPLAY_LEFT_STEP(hi, y, root, right, true);
	while (hi != NULL);
	*rlist = right;
}

/*
 * Break up the left subtree of "root", if it has one, pushing its nodes
 * onto "*llist" so that the head of the list becomes the predecessor of
 * "root".  "*llist" is left unchanged when "root" has no left child.
 */
static __always_inline void
vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
{
	vm_map_entry_t left, lo, y;

	left = *llist;
	/* A left pointer equal to the list head is a thread, not a child. */
	lo = root->left == left ? NULL : root->left;
	if (lo == NULL)
		return;
	do
		SPLAY_RIGHT_STEP(lo, y, left, root, true);
	while (lo != NULL);
	*llist = left;
}

/* Exchange the two map entry pointers "*a" and "*b". */
static inline void
vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
{
	vm_map_entry_t tmp;

	tmp = *b;
	*b = *a;
	*a = tmp;
}

/*
 * Walk back up the two spines, flip the pointers and set max_free.  The
 * subtrees of the root go at the bottom of llist and rlist.
 */
static vm_size_t
vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
{
	do {
		/*
		 * The max_free values of the children of llist are in
		 * llist->max_free and max_free.  Update with the
		 * max value.
		 */
		llist->max_free = max_free =
		    vm_size_max(llist->max_free, max_free);
		/*
		 * The two swaps make "tail" the right child of llist, move
		 * "tail" up to that node, and advance llist to the next
		 * node on the spine.
		 */
		vm_map_entry_swap(&llist->right, &tail);
		vm_map_entry_swap(&tail, &llist);
	} while (llist != header);
	root->left = tail;
	return (max_free);
}

/*
 * When llist is known to be the predecessor of root.
 */
static inline vm_size_t
vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t llist)
{
	vm_size_t max_free;

	max_free = root->start - llist->end;
	if (llist != header) {
		max_free = vm_map_splay_merge_left_walk(header, root,
		    root, max_free, llist);
	} else {
		/* Empty left side: thread root and header to each other. */
		root->left = header;
		header->right = root;
	}
	return (max_free);
}

/*
 * When llist may or may not be the predecessor of root.
 */
static inline vm_size_t
vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t llist)
{
	vm_size_t max_free;

	max_free = vm_map_entry_max_free_left(root, llist);
	if (llist != header) {
		max_free = vm_map_splay_merge_left_walk(header, root,
		    root->left == llist ? root : root->left,
		    max_free, llist);
	}
	return (max_free);
}

/*
 * Mirror image of vm_map_splay_merge_left_walk() for the right spine:
 * re-links rlist through the left pointers, updates max_free on the way
 * up and hangs the result off root->right.
 */
static vm_size_t
vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
{
	do {
		/*
		 * The max_free values of the children of rlist are in
		 * rlist->max_free and max_free.  Update with the
		 * max value.
		 */
		rlist->max_free = max_free =
		    vm_size_max(rlist->max_free, max_free);
		vm_map_entry_swap(&rlist->left, &tail);
		vm_map_entry_swap(&tail, &rlist);
	} while (rlist != header);
	root->right = tail;
	return (max_free);
}

/*
 * When rlist is known to be the successor of root.
 */
static inline vm_size_t
vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t rlist)
{
	vm_size_t max_free;

	max_free = rlist->start - root->end;
	if (rlist != header) {
		max_free = vm_map_splay_merge_right_walk(header, root,
		    root, max_free, rlist);
	} else {
		/* Empty right side: thread root and header to each other. */
		root->right = header;
		header->left = root;
	}
	return (max_free);
}

/*
 * When rlist may or may not be the successor of root.
*/
static inline vm_size_t
vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
    vm_map_entry_t rlist)
{
	vm_size_t max_free;

	max_free = vm_map_entry_max_free_right(root, rlist);
	if (rlist != header) {
		max_free = vm_map_splay_merge_right_walk(header, root,
		    root->right == rlist ? root : root->right,
		    max_free, rlist);
	}
	return (max_free);
}

/*
 *	vm_map_splay:
 *
 *	The Sleator and Tarjan top-down splay algorithm with the
 *	following variation.  Max_free must be computed bottom-up, so
 *	on the downward pass, maintain the left and right spines in
 *	reverse order.  Then, make a second pass up each side to fix
 *	the pointers and compute max_free.  The time bound is O(log n)
 *	amortized.
 *
 *	The tree is threaded, which means that there are no null pointers.
 *	When a node has no left child, its left pointer points to its
 *	predecessor, which is the last ancestor on the search path from the
 *	root where the search branched right.  Likewise, when a node has no
 *	right child, its right pointer points to its successor.  The map
 *	header node is the predecessor of the first map entry, and the
 *	successor of the last.
 *
 *	The new root is the vm_map_entry containing "addr", or else an
 *	adjacent entry (lower if possible) if addr is not in the tree.
 *
 *	The map must be locked, and leaves it so.
 *
 *	Returns: the new root.
 */
static vm_map_entry_t
vm_map_splay(vm_map_t map, vm_offset_t addr)
{
	vm_map_entry_t header, llist, rlist, root;
	vm_size_t max_free_left, max_free_right;

	header = &map->header;
	root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
	if (root != NULL) {
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else if (llist != header) {
		/*
		 * Recover the greatest node in the left
		 * subtree and make it the root.
		 */
		root = llist;
		llist = root->right;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
	} else if (rlist != header) {
		/*
		 * Recover the least node in the right
		 * subtree and make it the root.
		 */
		root = rlist;
		rlist = root->left;
		max_free_left = vm_map_splay_merge_pred(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else {
		/* There is no root. */
		return (NULL);
	}
	root->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	return (root);
}

/*
 *	vm_map_entry_{un,}link:
 *
 *	Insert/remove entries from maps.  On linking, if new entry clips
 *	existing entry, trim existing entry to avoid overlap, and manage
 *	offsets.  On unlinking, merge disappearing entry with neighbor, if
 *	called for, and manage offsets.  Callers should not modify fields in
 *	entries already mapped.
 */
static void
vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
{
	vm_map_entry_t header, llist, rlist, root;
	vm_size_t max_free_left, max_free_right;

	CTR3(KTR_VM,
	    "vm_map_entry_link: map %p, nentries %d, entry %p", map,
	    map->nentries, entry);
	VM_MAP_ASSERT_LOCKED(map);
	map->nentries++;
	header = &map->header;
	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
	if (root == NULL) {
		/*
		 * The new entry does not overlap any existing entry in the
		 * map, so it becomes the new root of the map tree.
		 */
		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
	} else if (entry->start == root->start) {
		/*
		 * The new entry is a clone of root, with only the end field
		 * changed.  The root entry will be shrunk to abut the new
		 * entry, and will be the right child of the new root entry in
		 * the modified map.
		 */
		KASSERT(entry->end < root->end,
		    ("%s: clip_start not within entry", __func__));
		vm_map_splay_findprev(root, &llist);
		root->offset += entry->end - root->start;
		root->start = entry->end;
		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
		max_free_right = root->max_free = vm_size_max(
		    vm_map_splay_merge_pred(entry, root, entry),
		    vm_map_splay_merge_right(header, root, rlist));
	} else {
		/*
		 * The new entry is a clone of root, with only the start field
		 * changed.  The root entry will be shrunk to abut the new
		 * entry, and will be the left child of the new root entry in
		 * the modified map.
		 */
		/*
		 * NOTE(review): the assertion text below repeats the
		 * "clip_start" wording of the previous branch although this
		 * branch handles a changed start field -- confirm the
		 * intended message.
		 */
		KASSERT(entry->end == root->end,
		    ("%s: clip_start not within entry", __func__));
		vm_map_splay_findnext(root, &rlist);
		entry->offset += entry->start - root->start;
		root->end = entry->start;
		max_free_left = root->max_free = vm_size_max(
		    vm_map_splay_merge_left(header, root, llist),
		    vm_map_splay_merge_succ(entry, root, entry));
		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
	}
	entry->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = entry;
	VM_MAP_ASSERT_CONSISTENT(map);
}

/*
 * UNLINK_MERGE_NEXT makes vm_map_entry_unlink() extend the successor of
 * the removed entry backwards so that it takes over the removed entry's
 * start address and offset.
 */
enum unlink_merge_type {
	UNLINK_MERGE_NONE,
	UNLINK_MERGE_NEXT
};

static void
vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
    enum unlink_merge_type op)
{
	vm_map_entry_t header, llist, rlist, root;
	vm_size_t max_free_left, max_free_right;

	VM_MAP_ASSERT_LOCKED(map);
	header = &map->header;
	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
	KASSERT(root != NULL,
	    ("vm_map_entry_unlink: unlink object not mapped"));

	/* Bring the predecessor and successor to the heads of the lists. */
	vm_map_splay_findprev(root, &llist);
	vm_map_splay_findnext(root, &rlist);
	if (op == UNLINK_MERGE_NEXT) {
		rlist->start = root->start;
		rlist->offset = root->offset;
	}
	/* Pick the new root: the predecessor if any, else the successor. */
	if (llist != header) {
		root = llist;
		llist = root->right;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
	} else if (rlist != header) {
		root = rlist;
		rlist = root->left;
		max_free_left = vm_map_splay_merge_pred(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else {
		/* The removed entry was the only one in the map. */
		header->left = header->right = header;
		root = NULL;
	}
	if (root != NULL)
		root->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	map->nentries--;
	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
	    map->nentries, entry);
}

/*
 *	vm_map_entry_resize:
 *
 *	Resize a vm_map_entry, recompute the amount of free space that
 *	follows it and propagate that value up the tree.
 *
 *	The map must be locked, and leaves it so.
 */
static void
vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
{
	vm_map_entry_t header, llist, rlist, root;

	VM_MAP_ASSERT_LOCKED(map);
	header = &map->header;
	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
	KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
	vm_map_splay_findnext(root, &rlist);
	/* Grow the entry before the gap to its successor is measured. */
	entry->end += grow_amount;
	root->max_free = vm_size_max(
	    vm_map_splay_merge_left(header, root, llist),
	    vm_map_splay_merge_succ(header, root, rlist));
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
	    __func__, map, map->nentries, entry);
}

/*
 *	vm_map_lookup_entry:	[ internal use only ]
 *
 *	Finds the map entry containing (or
 *	immediately preceding) the specified address
 *	in the given map; the entry is returned
 *	in the "entry" parameter.  The boolean
 *	result indicates whether the address is
 *	actually contained in the map.
 */
boolean_t
vm_map_lookup_entry(
	vm_map_t map,
	vm_offset_t address,
	vm_map_entry_t *entry)	/* OUT */
{
	vm_map_entry_t cur, header, lbound, ubound;
	boolean_t locked;

	/*
	 * If the map is empty, then the map entry immediately preceding
	 * "address" is the map's header.
	 */
	header = &map->header;
	cur = map->root;
	if (cur == NULL) {
		*entry = header;
		return (FALSE);
	}
	/* Fast path: the root already contains "address". */
	if (address >= cur->start && cur->end > address) {
		*entry = cur;
		return (TRUE);
	}
	if ((locked = vm_map_locked(map)) ||
	    sx_try_upgrade(&map->lock)) {
		/*
		 * Splay requires a write lock on the map.  However, it only
		 * restructures the binary search tree; it does not otherwise
		 * change the map.  Thus, the map's timestamp need not change
		 * on a temporary upgrade.
		 */
		cur = vm_map_splay(map, address);
		if (!locked) {
			VM_MAP_UNLOCK_CONSISTENT(map);
			sx_downgrade(&map->lock);
		}

		/*
		 * If "address" is contained within a map entry, the new root
		 * is that map entry.  Otherwise, the new root is a map entry
		 * immediately before or after "address".
		 */
		if (address < cur->start) {
			*entry = header;
			return (FALSE);
		}
		*entry = cur;
		return (address < cur->end);
	}
	/*
	 * Since the map is only locked for read access, perform a
	 * standard binary search tree lookup for "address".
	 */
	lbound = ubound = header;
	for (;;) {
		if (address < cur->start) {
			ubound = cur;
			cur = cur->left;
			/* Followed a thread back to the lower bound: stop. */
			if (cur == lbound)
				break;
		} else if (cur->end <= address) {
			lbound = cur;
			cur = cur->right;
			/* Followed a thread up to the upper bound: stop. */
			if (cur == ubound)
				break;
		} else {
			*entry = cur;
			return (TRUE);
		}
	}
	*entry = lbound;
	return (FALSE);
}

/*
 *	vm_map_insert:
 *
 *	Inserts the given whole VM object into the target
 *	map at the specified address range.  The object's
 *	size should match that of the address range.
 *
 *	Requires that the map be locked, and leaves it so.
 *
 *	If object is non-NULL, ref count must be bumped by caller
 *	prior to making call to account for the new entry.
*/
/*
 * Return values of vm_map_insert(), as produced by the body below:
 *
 *	KERN_INVALID_ADDRESS	empty range, or vm_map_range_valid() fails;
 *	KERN_PROTECTION_FAILURE	MAP_WXORX map and "prot" asks for both write
 *				and execute;
 *	KERN_NO_SPACE		"start" lies inside an existing entry, or the
 *				following entry begins before "end";
 *	KERN_INVALID_ARGUMENT	guard request with an object or a non-empty
 *				"max", or a bad/misaligned split boundary;
 *	KERN_RESOURCE_SHORTAGE	swap_reserve() refused the charge;
 *	KERN_SUCCESS		a new entry was linked, or the previous entry
 *				was extended to cover the range.
 */
int
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t new_entry, next_entry, prev_entry;
	struct ucred *cred;
	vm_eflags_t protoeflags;
	vm_inherit_t inheritance;
	u_long bdry;
	u_int bidx;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(object != kernel_object ||
	    (cow & MAP_COPY_ON_WRITE) == 0,
	    ("vm_map_insert: kernel object and COW"));
	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
	    (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
	    ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
	    object, cow));
	KASSERT((prot & ~max) == 0,
	    ("prot %#x is not subset of max_prot %#x", prot, max));

	/*
	 * Check that the start and end points are not bogus.
	 */
	if (start == end || !vm_map_range_valid(map, start, end))
		return (KERN_INVALID_ADDRESS);

	/* W^X maps refuse mappings that are both writable and executable. */
	if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
	    VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
		return (KERN_PROTECTION_FAILURE);

	/*
	 * Find the entry prior to the proposed starting address; if it's part
	 * of an existing entry, this range is bogus.
	 */
	if (vm_map_lookup_entry(map, start, &prev_entry))
		return (KERN_NO_SPACE);

	/*
	 * Assert that the next entry doesn't overlap the end point.
	 */
	next_entry = vm_map_entry_succ(prev_entry);
	if (next_entry->start < end)
		return (KERN_NO_SPACE);

	/* A guard entry carries neither an object nor any protection. */
	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
	    max != VM_PROT_NONE))
		return (KERN_INVALID_ARGUMENT);

	/* Translate the caller's MAP_* request bits into entry flags. */
	protoeflags = 0;
	if (cow & MAP_COPY_ON_WRITE)
		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
	if (cow & MAP_NOFAULT)
		protoeflags |= MAP_ENTRY_NOFAULT;
	if (cow & MAP_DISABLE_SYNCER)
		protoeflags |= MAP_ENTRY_NOSYNC;
	if (cow & MAP_DISABLE_COREDUMP)
		protoeflags |= MAP_ENTRY_NOCOREDUMP;
	if (cow & MAP_STACK_GROWS_DOWN)
		protoeflags |= MAP_ENTRY_GROWS_DOWN;
	if (cow & MAP_STACK_GROWS_UP)
		protoeflags |= MAP_ENTRY_GROWS_UP;
	if (cow & MAP_WRITECOUNT)
		protoeflags |= MAP_ENTRY_WRITECNT;
	if (cow & MAP_VN_EXEC)
		protoeflags |= MAP_ENTRY_VN_EXEC;
	if ((cow & MAP_CREATE_GUARD) != 0)
		protoeflags |= MAP_ENTRY_GUARD;
	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
	if (cow & MAP_INHERIT_SHARE)
		inheritance = VM_INHERIT_SHARE;
	else
		inheritance = VM_INHERIT_DEFAULT;
	if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
		/* This magically ignores index 0, for usual page size. */
		bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
		    MAP_SPLIT_BOUNDARY_SHIFT;
		if (bidx >= MAXPAGESIZES)
			return (KERN_INVALID_ARGUMENT);
		/* Both ends must be aligned to the requested page size. */
		bdry = pagesizes[bidx] - 1;
		if ((start & bdry) != 0 || (end & bdry) != 0)
			return (KERN_INVALID_ARGUMENT);
		protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
	}

	/*
	 * Decide whether the new range is charged to the current thread's
	 * credential.  "cred" stays NULL when no charge is recorded on the
	 * entry.
	 */
	cred = NULL;
	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
		goto charged;
	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
		/* Reserve swap unless the caller already did (ACC_CHARGED). */
		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
			return (KERN_RESOURCE_SHORTAGE);
		KASSERT(object == NULL ||
		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
		    object->cred == NULL,
		    ("overcommit: vm_map_insert o %p", object));
		cred = curthread->td_ucred;
	}

charged:
	/* Expand the kernel pmap, if necessary. */
	if (map == kernel_map && end > kernel_vm_end)
		pmap_growkernel(end);
	if (object != NULL) {
		/*
		 * OBJ_ONEMAPPING must be cleared unless this mapping
		 * is trivially proven to be the only mapping for any
		 * of the object's pages.  (Object granularity
		 * reference counting is insufficient to recognize
		 * aliases with precision.)
		 */
		if ((object->flags & OBJ_ANON) != 0) {
			VM_OBJECT_WLOCK(object);
			if (object->ref_count > 1 || object->shadow_count != 0)
				vm_object_clear_flag(object, OBJ_ONEMAPPING);
			VM_OBJECT_WUNLOCK(object);
		}
	} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
	    protoeflags &&
	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
	    MAP_VN_EXEC)) == 0 &&
	    prev_entry->end == start && (prev_entry->cred == cred ||
	    (prev_entry->object.vm_object != NULL &&
	    prev_entry->object.vm_object->cred == cred)) &&
	    vm_object_coalesce(prev_entry->object.vm_object,
	    prev_entry->offset,
	    (vm_size_t)(prev_entry->end - prev_entry->start),
	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
		/*
		 * We were able to extend the object.  Determine if we
		 * can extend the previous map entry to include the
		 * new range as well.
		 */
		if (prev_entry->inheritance == inheritance &&
		    prev_entry->protection == prot &&
		    prev_entry->max_protection == max &&
		    prev_entry->wired_count == 0) {
			KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
			    0, ("prev_entry %p has incoherent wiring",
			    prev_entry));
			/* Guard entries do not count toward map->size. */
			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
				map->size += end - prev_entry->end;
			vm_map_entry_resize(map, prev_entry,
			    end - prev_entry->end);
			vm_map_try_merge_entries(map, prev_entry, next_entry);
			return (KERN_SUCCESS);
		}

		/*
		 * If we can extend the object but cannot extend the
		 * map entry, we have to create a new map entry.  We
		 * must bump the ref count on the extended object to
		 * account for it.  object may be NULL.
		 */
		object = prev_entry->object.vm_object;
		offset = prev_entry->offset +
		    (prev_entry->end - prev_entry->start);
		vm_object_reference(object);
		if (cred != NULL && object != NULL && object->cred != NULL &&
		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
			/* Object already accounts for this uid. */
			cred = NULL;
		}
	}
	/* The new entry keeps its own reference on the credential. */
	if (cred != NULL)
		crhold(cred);

	/*
	 * Create a new entry
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->start = start;
	new_entry->end = end;
	new_entry->cred = NULL;

	new_entry->eflags = protoeflags;
	new_entry->object.vm_object = object;
	new_entry->offset = offset;

	new_entry->inheritance = inheritance;
	new_entry->protection = prot;
	new_entry->max_protection = max;
	new_entry->wired_count = 0;
	new_entry->wiring_thread = NULL;
	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
	new_entry->next_read = start;

	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
	new_entry->cred = cred;

	/*
	 * Insert the new entry into the list
	 */
	vm_map_entry_link(map, new_entry);
	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
		map->size += new_entry->end - new_entry->start;

	/*
	 * Try to coalesce the new entry with both the previous and next
	 * entries in the list.  Previously, we only attempted to coalesce
	 * with the previous entry when object is NULL.  Here, we handle the
	 * other cases, which are less common.
	 */
	vm_map_try_merge_entries(map, prev_entry, new_entry);
	vm_map_try_merge_entries(map, new_entry, next_entry);

	/* Optionally pre-populate the pmap with already-resident pages. */
	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
		    end - start, cow & MAP_PREFAULT_PARTIAL);
	}

	return (KERN_SUCCESS);
}

/*
 *	vm_map_findspace:
 *
 *	Find the first fit (lowest VM address) for "length" free bytes
 *	beginning at address >= start in the given map.
 *
 *	In a vm_map_entry, "max_free" is the maximum amount of
 *	contiguous free space between an entry in its subtree and a
 *	neighbor of that entry.
 *	This allows finding a free region in
 *	one path down the tree, so O(log n) amortized with splay
 *	trees.
 *
 *	The map must be locked, and leaves it so.
 *
 *	Returns: starting address if sufficient space,
 *		 vm_map_max(map)-length+1 if insufficient space.
 */
vm_offset_t
vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
{
	vm_map_entry_t header, llist, rlist, root, y;
	vm_size_t left_length, max_free_left, max_free_right;
	vm_offset_t gap_end;

	VM_MAP_ASSERT_LOCKED(map);

	/*
	 * Request must fit within min/max VM address and must avoid
	 * address wrap.
	 */
	start = MAX(start, vm_map_min(map));
	if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
		return (vm_map_max(map) - length + 1);

	/* Empty tree means wide open address space. */
	if (map->root == NULL)
		return (start);

	/*
	 * After splay_split, if start is within an entry, push it to the start
	 * of the following gap.  If rlist is at the end of the gap containing
	 * start, save the end of that gap in gap_end to see if the gap is big
	 * enough; otherwise set gap_end to start to skip gap-checking and move
	 * directly to a search of the right subtree.
	 */
	header = &map->header;
	root = vm_map_splay_split(map, start, length, &llist, &rlist);
	gap_end = rlist->start;
	if (root != NULL) {
		start = root->end;
		if (root->right != rlist)
			gap_end = start;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else if (rlist != header) {
		root = rlist;
		rlist = root->left;
		max_free_left = vm_map_splay_merge_pred(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else {
		root = llist;
		llist = root->right;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
	}
	root->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	/* The gap that begins at "start" is already large enough. */
	if (length <= gap_end - start)
		return (start);

	/* With max_free, can immediately tell if no solution. */
	if (root->right == header || length > root->right->max_free)
		return (vm_map_max(map) - length + 1);

	/*
	 * Splay for the least large-enough gap in the right subtree.
	 */
	llist = rlist = header;
	for (left_length = 0;;
	    left_length = vm_map_entry_max_free_left(root, llist)) {
		if (length <= left_length)
			SPLAY_LEFT_STEP(root, y, llist, rlist,
			    length <= vm_map_entry_max_free_left(y, llist));
		else
			SPLAY_RIGHT_STEP(root, y, llist, rlist,
			    length > vm_map_entry_max_free_left(y, root));
		if (root == NULL)
			break;
	}
	/* The entry at the head of llist precedes the chosen gap. */
	root = llist;
	llist = root->right;
	max_free_left = vm_map_splay_merge_left(header, root, llist);
	if (rlist == header) {
		root->max_free = vm_size_max(max_free_left,
		    vm_map_splay_merge_succ(header, root, rlist));
	} else {
		y = rlist;
		rlist = y->left;
		y->max_free = vm_size_max(
		    vm_map_splay_merge_pred(root, y, root),
		    vm_map_splay_merge_right(header, y, rlist));
		root->max_free = vm_size_max(max_free_left, y->max_free);
	}
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	return (root->end);
}

/*
 *	vm_map_fixed:
 *
 *	Create a mapping of "length" bytes at exactly "start".  Unless
 *	MAP_CHECK_EXCL is set in "cow", whatever is currently mapped in the
 *	range is first removed with vm_map_delete().  Stack requests
 *	(MAP_STACK_GROWS_DOWN/UP) are handed to vm_map_stack_locked();
 *	everything else goes to vm_map_insert().
 *
 *	Locks the map for the duration of the call and returns the KERN_*
 *	status of the last operation performed.
 */
int
vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_size_t length, vm_prot_t prot,
    vm_prot_t max, int cow)
{
	vm_offset_t end;
	int result;

	end = start + length;
	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
	    object == NULL,
	    ("vm_map_fixed: non-NULL backing object for stack"));
	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if ((cow & MAP_CHECK_EXCL) == 0) {
		result = vm_map_delete(map, start, end);
		if (result != KERN_SUCCESS)
			goto out;
	}
	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
		result = vm_map_stack_locked(map, start, length, sgrowsiz,
		    prot, max, cow);
	} else {
		result = vm_map_insert(map, object, offset, start, end,
		    prot, max, cow);
	}
out:
	vm_map_unlock(map);
	return (result);
}

/*
 * Size of the randomization gap used by vm_map_find(), in pages of size
 * pagesizes[index]; the table is indexed by the page size index that
 * vm_map_find() selects (0 or 1).
 */
static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
static const int aslr_pages_rnd_32[2] = {0x100, 0x4};

static int cluster_anon = 1;
SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
    &cluster_anon, 0,
    "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");

/*
 * Report whether clustering of anonymous mappings is permitted for a
 * request whose address hint is "addr", according to the vm.cluster_anon
 * setting: 0 never, 1 only when no hint was given (addr == 0), any other
 * value always.
 */
static bool
clustering_anon_allowed(vm_offset_t addr)
{

	switch (cluster_anon) {
	case 0:
		return (false);
	case 1:
		return (addr == 0);
	case 2:
	default:
		return (true);
	}
}

/* Incremented by vm_map_find() each time it starts a second attempt. */
static long aslr_restarts;
SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
    &aslr_restarts, 0,
    "Number of aslr failures");

/*
 * Searches for the specified amount of free space in the given map with the
 * specified alignment.  Performs an address-ordered, first-fit search from
 * the given address "*addr", with an optional upper bound "max_addr".  If the
 * parameter "alignment" is zero, then the alignment is computed from the
 * given (object, offset) pair so as to enable the greatest possible use of
 * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
 * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
 *
 * The map must be locked.  Initially, there must be at least "length" bytes
 * of free space at the given address.
 */
static int
vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
    vm_offset_t alignment)
{
	vm_offset_t aligned_addr, free_addr;

	VM_MAP_ASSERT_LOCKED(map);
	free_addr = *addr;
	KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
	    ("caller failed to provide space %#jx at address %p",
	    (uintmax_t)length, (void *)free_addr));
	for (;;) {
		/*
		 * At the start of every iteration, the free space at address
		 * "*addr" is at least "length" bytes.
		 */
		if (alignment == 0)
			pmap_align_superpage(object, offset, addr, length);
		else
			*addr = roundup2(*addr, alignment);
		aligned_addr = *addr;
		if (aligned_addr == free_addr) {
			/*
			 * Alignment did not change "*addr", so "*addr" must
			 * still provide sufficient free space.
			 */
			return (KERN_SUCCESS);
		}

		/*
		 * Test for address wrap on "*addr".  A wrapped "*addr" could
		 * be a valid address, in which case vm_map_findspace() cannot
		 * be relied upon to fail.
		 */
		if (aligned_addr < free_addr)
			return (KERN_NO_SPACE);
		*addr = vm_map_findspace(map, aligned_addr, length);
		if (*addr + length > vm_map_max(map) ||
		    (max_addr != 0 && *addr + length > max_addr))
			return (KERN_NO_SPACE);
		free_addr = *addr;
		if (free_addr == aligned_addr) {
			/*
			 * If a successful call to vm_map_findspace() did not
			 * change "*addr", then "*addr" must still be aligned
			 * and provide sufficient free space.
			 */
			return (KERN_SUCCESS);
		}
	}
}

/*
 *	vm_map_find_aligned:
 *
 *	Find "length" bytes of free space at or above "*addr", not
 *	extending past "max_addr" when that is non-zero, and aligned as
 *	requested by "alignment".  On success the address is stored in
 *	"*addr" and KERN_SUCCESS is returned; otherwise KERN_NO_SPACE.
 *	No mapping is created.
 */
int
vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
    vm_offset_t max_addr, vm_offset_t alignment)
{
	/* XXXKIB ASLR eh ? */
	*addr = vm_map_findspace(map, *addr, length);
	if (*addr + length > vm_map_max(map) ||
	    (max_addr != 0 && *addr + length > max_addr))
		return (KERN_NO_SPACE);
	return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
	    alignment));
}

/*
 *	vm_map_find finds an unallocated region in the target address
 *	map with the given length.  The search is defined to be
 *	first-fit from the specified address; the region found is
 *	returned in the same parameter.
 *
 *	If object is non-NULL, ref count must be bumped by caller
 *	prior to making call to account for the new entry.
*/
/*
 * Notes on vm_map_find() arguments, as used by the body below:
 *
 *	find_space	one of the VMFS_* policies.  When any bit above the
 *			low byte is set, the low byte must be zero and
 *			(find_space >> 8) is the log2 of the required
 *			alignment.  VMFS_NO_SPACE means "*addr" is used
 *			as given, without searching.
 *	max_addr	if non-zero, the mapping must end at or below it.
 *	cow		MAP_* flags; MAP_REMAP is only valid together with
 *			VMFS_NO_SPACE and deletes the old range first.
 *
 * The function takes and releases the map lock itself.  It returns the
 * status of vm_map_insert()/vm_map_stack_locked(), or the error that made
 * the search or the remap fail.
 */
int
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr,	/* IN/OUT */
    vm_size_t length, vm_offset_t max_addr, int find_space,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_offset_t alignment, curr_min_addr, min_addr;
	int gap, pidx, rv, try;
	bool cluster, en_aslr, update_anon;

	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
	    object == NULL,
	    ("vm_map_find: non-NULL backing object for stack"));
	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
	/* Superpage-optimal placement only applies to colored objects. */
	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
	    (object->flags & OBJ_COLORED) == 0))
		find_space = VMFS_ANY_SPACE;
	if (find_space >> 8 != 0) {
		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
		alignment = (vm_offset_t)1 << (find_space >> 8);
	} else
		alignment = 0;
	en_aslr = (map->flags & MAP_ASLR) != 0;
	/*
	 * "cluster" may be switched off while retrying; "update_anon" keeps
	 * the initial decision so that map->anon_loc is still advanced on
	 * success.
	 */
	update_anon = cluster = clustering_anon_allowed(*addr) &&
	    (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
	    find_space != VMFS_NO_SPACE && object == NULL &&
	    (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
	    MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
	curr_min_addr = min_addr = *addr;
	if (en_aslr && min_addr == 0 && !cluster &&
	    find_space != VMFS_NO_SPACE &&
	    (map->flags & MAP_ASLR_IGNSTART) != 0)
		curr_min_addr = min_addr = vm_map_min(map);
	try = 0;
	vm_map_lock(map);
	if (cluster) {
		/* Start from the end of the last clustered mapping, if any. */
		curr_min_addr = map->anon_loc;
		if (curr_min_addr == 0)
			cluster = false;
	}
	if (find_space != VMFS_NO_SPACE) {
		KASSERT(find_space == VMFS_ANY_SPACE ||
		    find_space == VMFS_OPTIMAL_SPACE ||
		    find_space == VMFS_SUPER_SPACE ||
		    alignment != 0, ("unexpected VMFS flag"));
again:
		/*
		 * When creating an anonymous mapping, try clustering
		 * with an existing anonymous mapping first.
		 *
		 * We make up to two attempts to find address space
		 * for a given find_space value.  The first attempt may
		 * apply randomization or may cluster with an existing
		 * anonymous mapping.  If this first attempt fails,
		 * perform a first-fit search of the available address
		 * space.
		 *
		 * If all tries failed, and find_space is
		 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
		 * Again enable clustering and randomization.
		 */
		try++;
		MPASS(try <= 2);

		if (try == 2) {
			/*
			 * Second try: we failed either to find a
			 * suitable region for randomizing the
			 * allocation, or to cluster with an existing
			 * mapping.  Retry with free run.
			 */
			curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
			    vm_map_min(map) : min_addr;
			atomic_add_long(&aslr_restarts, 1);
		}

		if (try == 1 && en_aslr && !cluster) {
			/*
			 * Find space for allocation, including
			 * gap needed for later randomization.
			 */
			pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
			    (find_space == VMFS_SUPER_SPACE || find_space ==
			    VMFS_OPTIMAL_SPACE) ? 1 : 0;
			gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
			    (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
			    aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
			*addr = vm_map_findspace(map, curr_min_addr,
			    length + gap * pagesizes[pidx]);
			if (*addr + length + gap * pagesizes[pidx] >
			    vm_map_max(map))
				goto again;
			/* And randomize the start address. */
			*addr += (arc4random() % gap) * pagesizes[pidx];
			if (max_addr != 0 && *addr + length > max_addr)
				goto again;
		} else {
			*addr = vm_map_findspace(map, curr_min_addr, length);
			if (*addr + length > vm_map_max(map) ||
			    (max_addr != 0 && *addr + length > max_addr)) {
				if (cluster) {
					/* Give up clustering and retry. */
					cluster = false;
					MPASS(try == 1);
					goto again;
				}
				rv = KERN_NO_SPACE;
				goto done;
			}
		}

		if (find_space != VMFS_ANY_SPACE &&
		    (rv = vm_map_alignspace(map, object, offset, addr, length,
		    max_addr, alignment)) != KERN_SUCCESS) {
			if (find_space == VMFS_OPTIMAL_SPACE) {
				/* Start over with the relaxed policy. */
				find_space = VMFS_ANY_SPACE;
				curr_min_addr = min_addr;
				cluster = update_anon;
				try = 0;
				goto again;
			}
			goto done;
		}
	} else if ((cow & MAP_REMAP) != 0) {
		if (!vm_map_range_valid(map, *addr, *addr + length)) {
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
		rv = vm_map_delete(map, *addr, *addr + length);
		if (rv != KERN_SUCCESS)
			goto done;
	}
	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
		rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
		    max, cow);
	} else {
		rv = vm_map_insert(map, object, offset, *addr, *addr + length,
		    prot, max, cow);
	}
	/* Remember where the next clustered anonymous mapping should go. */
	if (rv == KERN_SUCCESS && update_anon)
		map->anon_loc = *addr + length;
done:
	vm_map_unlock(map);
	return (rv);
}

/*
 *	vm_map_find_min() is a variant of vm_map_find() that takes an
 *	additional parameter (min_addr) and treats the given address
 *	(*addr) differently.  Specifically, it treats *addr as a hint
 *	and not as the minimum address where the mapping is created.
 *
 *	This function works in two phases.  First, it tries to
 *	allocate above the hint.  If that fails and the hint is
 *	greater than min_addr, it performs a second pass, replacing
 *	the hint with min_addr as the minimum address for the
 *	allocation.
*/ int vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t hint; int rv; hint = *addr; for (;;) { rv = vm_map_find(map, object, offset, addr, length, max_addr, find_space, prot, max, cow); if (rv == KERN_SUCCESS || min_addr >= hint) return (rv); *addr = hint = min_addr; } } /* * A map entry with any of the following flags set must not be merged with * another entry. */ #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \ MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC) static bool vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry) { KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 || (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0, ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable", prev, entry)); return (prev->end == entry->start && prev->object.vm_object == entry->object.vm_object && (prev->object.vm_object == NULL || prev->offset + (prev->end - prev->start) == entry->offset) && prev->eflags == entry->eflags && prev->protection == entry->protection && prev->max_protection == entry->max_protection && prev->inheritance == entry->inheritance && prev->wired_count == entry->wired_count && prev->cred == entry->cred); } static void vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry) { /* * If the backing object is a vnode object, vm_object_deallocate() * calls vrele(). However, vrele() does not lock the vnode because * the vnode has additional references. Thus, the map lock can be * kept without causing a lock-order reversal with the vnode lock. * * Since we count the number of virtual page mappings in * object->un_pager.vnp.writemappings, the writemappings value * should not be adjusted when the entry is disposed of. 
*/ if (entry->object.vm_object != NULL) vm_object_deallocate(entry->object.vm_object); if (entry->cred != NULL) crfree(entry->cred); vm_map_entry_dispose(map, entry); } /* * vm_map_try_merge_entries: * * Compare the given map entry to its predecessor, and merge its precessor * into it if possible. The entry remains valid, and may be extended. * The predecessor may be deleted. * * The map must be locked. */ void vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 && vm_map_mergeable_neighbors(prev_entry, entry)) { vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT); vm_map_merged_neighbor_dispose(map, prev_entry); } } /* * vm_map_entry_back: * * Allocate an object to back a map entry. */ static inline void vm_map_entry_back(vm_map_entry_t entry) { vm_object_t object; KASSERT(entry->object.vm_object == NULL, ("map entry %p has backing object", entry)); KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("map entry %p is a submap", entry)); object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL, entry->cred, entry->end - entry->start); entry->object.vm_object = object; entry->offset = 0; entry->cred = NULL; } /* * vm_map_entry_charge_object * * If there is no object backing this entry, create one. Otherwise, if * the entry has cred, give it to the backing object. 
*/ static inline void vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("map entry %p is a submap", entry)); if (entry->object.vm_object == NULL && !map->system_map && (entry->eflags & MAP_ENTRY_GUARD) == 0) vm_map_entry_back(entry); else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: %s: both cred e %p", __func__, entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } } /* * vm_map_entry_clone * * Create a duplicate map entry for clipping. */ static vm_map_entry_t vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); /* * Create a backing object now, if none exists, so that more individual * objects won't be created after the map entry is split. */ vm_map_entry_charge_object(map, entry); /* Clone the entry. */ new_entry = vm_map_entry_create(map); *new_entry = *entry; if (new_entry->cred != NULL) crhold(entry->cred); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); vm_map_entry_set_vnode_text(new_entry, true); /* * The object->un_pager.vnp.writemappings for the object of * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The * virtual pages are re-distributed among the clipped entries, * so the sum is left the same. */ } return (new_entry); } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. 
*/ static int vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr) { vm_map_entry_t new_entry; int bdry_idx; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p entry %p start 0x%jx", __func__, map, entry, (uintmax_t)startaddr); if (startaddr <= entry->start) return (KERN_SUCCESS); VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->end > startaddr && entry->start < startaddr, ("%s: invalid clip of entry %p", __func__, entry)); bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT; if (bdry_idx != 0) { if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0) return (KERN_INVALID_ARGUMENT); } new_entry = vm_map_entry_clone(map, entry); /* * Split off the front portion. Insert the new entry BEFORE this one, * so that this entry has the specified starting address. */ new_entry->end = startaddr; vm_map_entry_link(map, new_entry); return (KERN_SUCCESS); } /* * vm_map_lookup_clip_start: * * Find the entry at or just after 'start', and clip it if 'start' is in * the interior of the entry. Return entry after 'start', and in * prev_entry set the entry before 'start'. */ static int vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start, vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry) { vm_map_entry_t entry; int rv; if (!map->system_map) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: map %p start 0x%jx prev %p", __func__, map, (uintmax_t)start, prev_entry); if (vm_map_lookup_entry(map, start, prev_entry)) { entry = *prev_entry; rv = vm_map_clip_start(map, entry, start); if (rv != KERN_SUCCESS) return (rv); *prev_entry = vm_map_entry_pred(entry); } else entry = vm_map_entry_succ(*prev_entry); *res_entry = entry; return (KERN_SUCCESS); } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. 
*/
static int
vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
{
	vm_map_entry_t back;
	int bidx;

	if (!map->system_map)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
		    "%s: map %p entry %p end 0x%jx", __func__, map, entry,
		    (uintmax_t)endaddr);

	/* Already ends at or before the requested address. */
	if (entry->end <= endaddr)
		return (KERN_SUCCESS);

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(entry->start < endaddr && entry->end > endaddr,
	    ("%s: invalid clip of entry %p", __func__, entry));

	/* Entries with a split boundary may only be cut on that boundary. */
	bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
	    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
	if (bidx != 0 && (endaddr & (pagesizes[bidx] - 1)) != 0)
		return (KERN_INVALID_ARGUMENT);

	/*
	 * The clone becomes the back piece and is linked in after
	 * "entry", which thereby ends up ending at "endaddr".
	 */
	back = vm_map_entry_clone(map, entry);
	back->start = endaddr;
	vm_map_entry_link(map, back);
	return (KERN_SUCCESS);
}

/*
 *	vm_map_submap:		[ kernel use only ]
 *
 *	Mark the given range as handled by a subordinate map.
 *
 *	This range must have been created with vm_map_find,
 *	and no other operations may have been performed on this
 *	range prior to calling vm_map_submap.
 *
 *	Only a limited number of operations can be performed
 *	within this range after calling vm_map_submap:
 *		vm_fault
 *	[Don't try vm_map_copy!]
 *
 *	To remove a submapping, one must first remove the
 *	range from the superior map, and then destroy the
 *	submap (if desired).  [Better yet, don't try it.]
*/ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result; result = KERN_INVALID_ARGUMENT; vm_map_lock(submap); submap->flags |= MAP_IS_SUB_MAP; vm_map_unlock(submap); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end && (entry->eflags & MAP_ENTRY_COW) == 0 && entry->object.vm_object == NULL) { result = vm_map_clip_start(map, entry, start); if (result != KERN_SUCCESS) goto unlock; result = vm_map_clip_end(map, entry, end); if (result != KERN_SUCCESS) goto unlock; entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } unlock: vm_map_unlock(map); if (result != KERN_SUCCESS) { vm_map_lock(submap); submap->flags &= ~MAP_IS_SUB_MAP; vm_map_unlock(submap); } return (result); } /* * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload the specified map's pmap with mappings to the specified * object's memory-resident pages. No further physical pages are * allocated, and no further virtual pages are retrieved from secondary * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a * limited number of page mappings are created at the low-end of the * specified address range. (For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within * the specified address range are mapped. 
*/
static void
vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
{
	vm_offset_t start;
	vm_page_t p, p_start;
	vm_pindex_t mask, psize, threshold, tmpidx;

	/* Nothing to do without read/execute permission or an object. */
	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
		return;
	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
		/*
		 * The type is tested once unlocked and again under the
		 * write lock; only if it still matches is the work handed
		 * to pmap_object_init_pt().
		 */
		VM_OBJECT_WLOCK(object);
		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
			pmap_object_init_pt(map->pmap, addr, object, pindex,
			    size);
			VM_OBJECT_WUNLOCK(object);
			return;
		}
		VM_OBJECT_LOCK_DOWNGRADE(object);
	} else
		VM_OBJECT_RLOCK(object);

	/* Clamp the page count so that it does not run past the object. */
	psize = atop(size);
	if (psize + pindex > object->size) {
		if (pindex >= object->size) {
			VM_OBJECT_RUNLOCK(object);
			return;
		}
		psize = object->size - pindex;
	}

	start = 0;
	p_start = NULL;
	threshold = MAX_INIT_PT;

	p = vm_page_find_least(object, pindex);
	/*
	 * Assert: the variable p is either (1) the page with the
	 * least pindex greater than or equal to the parameter pindex
	 * or (2) NULL.
	 */
	for (;
	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
	     p = TAILQ_NEXT(p, listq)) {
		/*
		 * don't allow an madvise to blow away our really
		 * free pages allocating pv entries.
		 */
		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
		    vm_page_count_severe()) ||
		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
		    tmpidx >= threshold)) {
			psize = tmpidx;
			break;
		}
		if (vm_page_all_valid(p)) {
			/* Open a new run of valid pages if none is open. */
			if (p_start == NULL) {
				start = addr + ptoa(tmpidx);
				p_start = p;
			}
			/* Jump ahead if a superpage mapping is possible. */
			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
			    (pagesizes[p->psind] - 1)) == 0) {
				mask = atop(pagesizes[p->psind]) - 1;
				if (tmpidx + mask < psize &&
				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
					p += mask;
					threshold += mask;
				}
			}
		} else if (p_start != NULL) {
			/* A page that is not fully valid ends the run. */
			pmap_enter_object(map->pmap, start, addr +
			    ptoa(tmpidx), p_start, prot);
			p_start = NULL;
		}
	}
	/* Flush the run that was still open when the loop ended. */
	if (p_start != NULL)
		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
		    p_start, prot);
	VM_OBJECT_RUNLOCK(object);
}

/*
 *	vm_map_protect:
 *
 *	Sets the protection and/or the maximum protection of the
 *	specified address region in the target map.
 */
int
vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
    vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
{
	vm_map_entry_t entry, first_entry, in_tran, prev_entry;
	vm_object_t obj;
	struct ucred *cred;
	vm_prot_t old_prot;
	int rv;

	if (start == end)
		return (KERN_SUCCESS);

	/*
	 * When both values are being set, the new protection must be a
	 * subset of the new maximum protection.
	 */
	if ((flags & (VM_MAP_PROTECT_SET_PROT | VM_MAP_PROTECT_SET_MAXPROT)) ==
	    (VM_MAP_PROTECT_SET_PROT | VM_MAP_PROTECT_SET_MAXPROT) &&
	    (new_prot & new_maxprot) != new_prot)
		return (KERN_OUT_OF_BOUNDS);

again:
	in_tran = NULL;
	vm_map_lock(map);

	/* On a MAP_WXORX map, refuse write and execute together. */
	if ((map->flags & MAP_WXORX) != 0 &&
	    (flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
	    (new_prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE |
	    VM_PROT_EXECUTE)) {
		vm_map_unlock(map);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
	 * need to fault pages into the map and will drop the map lock while
	 * doing so, and the VM object may end up in an inconsistent state if we
	 * update the protection on the map entry in between faults.
	 */
	vm_map_wait_busy(map);

	VM_MAP_RANGE_CHECK(map, start, end);

	if (!vm_map_lookup_entry(map, start, &first_entry))
		first_entry = vm_map_entry_succ(first_entry);

	/*
	 * Make a first pass to check for protection violations.
	 */
	for (entry = first_entry; entry->start < end;
	    entry = vm_map_entry_succ(entry)) {
		if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
			continue;
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
			vm_map_unlock(map);
			return (KERN_INVALID_ARGUMENT);
		}
		/* A value that is not being set defaults to the entry's own. */
		if ((flags & VM_MAP_PROTECT_SET_PROT) == 0)
			new_prot = entry->protection;
		if ((flags & VM_MAP_PROTECT_SET_MAXPROT) == 0)
			new_maxprot = entry->max_protection;
		if ((new_prot & entry->max_protection) != new_prot ||
		    (new_maxprot & entry->max_protection) != new_maxprot) {
			vm_map_unlock(map);
			return (KERN_PROTECTION_FAILURE);
		}
		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
			in_tran = entry;
	}

	/*
	 * Postpone the operation until all in-transition map entries have
	 * stabilized.  An in-transition entry might already have its pages
	 * wired and wired_count incremented, but not yet have its
	 * MAP_ENTRY_USER_WIRED flag set.  In which case, we would fail to call
	 * vm_fault_copy_entry() in the final loop below.
	 */
	if (in_tran != NULL) {
		in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
		vm_map_unlock_and_wait(map, 0);
		goto again;
	}

	/*
	 * Before changing the protections, try to reserve swap space for any
	 * private (i.e., copy-on-write) mappings that are transitioning from
	 * read-only to read/write access.  If a reservation fails, break out
	 * of this loop early and let the next loop simplify the entries, since
	 * some may now be mergeable.
	 */
	rv = vm_map_clip_start(map, first_entry, start);
	if (rv != KERN_SUCCESS) {
		vm_map_unlock(map);
		return (rv);
	}
	for (entry = first_entry; entry->start < end;
	    entry = vm_map_entry_succ(entry)) {
		rv = vm_map_clip_end(map, entry, end);
		if (rv != KERN_SUCCESS) {
			vm_map_unlock(map);
			return (rv);
		}

		/*
		 * Skip entries that do not gain write access, are already
		 * charged, or are guard entries.
		 */
		if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
		    ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
		    ENTRY_CHARGED(entry) ||
		    (entry->eflags & MAP_ENTRY_GUARD) != 0)
			continue;

		cred = curthread->td_ucred;
		obj = entry->object.vm_object;

		/* No object, or needs-copy: charge the entry itself. */
		if (obj == NULL ||
		    (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
			if (!swap_reserve(entry->end - entry->start)) {
				rv = KERN_RESOURCE_SHORTAGE;
				end = entry->end;
				break;
			}
			crhold(cred);
			entry->cred = cred;
			continue;
		}

		VM_OBJECT_WLOCK(obj);
		/*
		 * NOTE(review): the '-' and '+' prefixed lines below are
		 * unified-diff markers carried in this text; the change
		 * drops the OBJT_DEFAULT test and keeps only OBJ_SWAP.
		 */
-		if (obj->type != OBJT_DEFAULT &&
-		    (obj->flags & OBJ_SWAP) == 0) {
+		if ((obj->flags & OBJ_SWAP) == 0) {
			VM_OBJECT_WUNLOCK(obj);
			continue;
		}

		/*
		 * Charge for the whole object allocation now, since
		 * we cannot distinguish between non-charged and
		 * charged clipped mapping of the same object later.
		 */
		KASSERT(obj->charge == 0,
		    ("vm_map_protect: object %p overcharged (entry %p)",
		    obj, entry));
		if (!swap_reserve(ptoa(obj->size))) {
			VM_OBJECT_WUNLOCK(obj);
			rv = KERN_RESOURCE_SHORTAGE;
			end = entry->end;
			break;
		}

		crhold(cred);
		obj->cred = cred;
		obj->charge = ptoa(obj->size);
		VM_OBJECT_WUNLOCK(obj);
	}

	/*
	 * If enough swap space was available, go back and fix up protections.
	 * Otherwise, just simplify entries, since some may have been modified.
	 * [Note that clipping is not necessary the second time.]
	 */
	for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
	    entry->start < end;
	    vm_map_try_merge_entries(map, prev_entry, entry),
	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
		if (rv != KERN_SUCCESS ||
		    (entry->eflags & MAP_ENTRY_GUARD) != 0)
			continue;

		old_prot = entry->protection;

		/* Lowering the maximum also trims the current protection. */
		if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
			entry->max_protection = new_maxprot;
			entry->protection = new_maxprot & old_prot;
		}
		if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
			entry->protection = new_prot;

		/*
		 * For user wired map entries, the normal lazy evaluation of
		 * write access upgrades through soft page faults is
		 * undesirable.  Instead, immediately copy any pages that are
		 * copy-on-write and enable write access in the physical map.
		 */
		if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
		    (entry->protection & VM_PROT_WRITE) != 0 &&
		    (old_prot & VM_PROT_WRITE) == 0)
			vm_fault_copy_entry(map, map, entry, entry, NULL);

		/*
		 * When restricting access, update the physical map.  Worry
		 * about copy-on-write here.
		 */
		if ((old_prot & ~entry->protection) != 0) {
#define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
							VM_PROT_ALL)
			pmap_protect(map->pmap, entry->start,
			    entry->end,
			    entry->protection & MASK(entry));
#undef	MASK
		}
	}
	vm_map_try_merge_entries(map, prev_entry, entry);
	vm_map_unlock(map);
	return (rv);
}

/*
 *	vm_map_madvise:
 *
 *	This routine traverses a process's map handling the madvise
 *	system call.  Advisories are classified as either those affecting
 *	the vm_map_entry structure, or those affecting the underlying
 *	objects.
 *
 *	Returns 0, EINVAL for an unrecognized behav, or the
 *	vm_mmap_to_errno() translation of a clipping failure.
 */
int
vm_map_madvise(
	vm_map_t map,
	vm_offset_t start,
	vm_offset_t end,
	int behav)
{
	vm_map_entry_t entry, prev_entry;
	int rv;
	bool modify_map;

	/*
	 * Some madvise calls directly modify the vm_map_entry, in which case
	 * we need to use an exclusive lock on the map and we need to perform
	 * various clipping operations.  Otherwise we only need a read-lock
	 * on the map.
	 */
	switch(behav) {
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_NOSYNC:
	case MADV_AUTOSYNC:
	case MADV_NOCORE:
	case MADV_CORE:
		if (start == end)
			return (0);
		modify_map = true;
		vm_map_lock(map);
		break;
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
		if (start == end)
			return (0);
		modify_map = false;
		vm_map_lock_read(map);
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Locate starting entry and clip if necessary.
	 */
	VM_MAP_RANGE_CHECK(map, start, end);

	if (modify_map) {
		/*
		 * madvise behaviors that are implemented in the vm_map_entry.
		 *
		 * We clip the vm_map_entry so that behavioral changes are
		 * limited to the specified address range.
		 */
		rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
		if (rv != KERN_SUCCESS) {
			vm_map_unlock(map);
			return (vm_mmap_to_errno(rv));
		}

		for (; entry->start < end; prev_entry = entry,
		    entry = vm_map_entry_succ(entry)) {
			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
				continue;

			rv = vm_map_clip_end(map, entry, end);
			if (rv != KERN_SUCCESS) {
				vm_map_unlock(map);
				return (vm_mmap_to_errno(rv));
			}

			switch (behav) {
			case MADV_NORMAL:
				vm_map_entry_set_behavior(entry,
				    MAP_ENTRY_BEHAV_NORMAL);
				break;
			case MADV_SEQUENTIAL:
				vm_map_entry_set_behavior(entry,
				    MAP_ENTRY_BEHAV_SEQUENTIAL);
				break;
			case MADV_RANDOM:
				vm_map_entry_set_behavior(entry,
				    MAP_ENTRY_BEHAV_RANDOM);
				break;
			case MADV_NOSYNC:
				entry->eflags |= MAP_ENTRY_NOSYNC;
				break;
			case MADV_AUTOSYNC:
				entry->eflags &= ~MAP_ENTRY_NOSYNC;
				break;
			case MADV_NOCORE:
				entry->eflags |= MAP_ENTRY_NOCOREDUMP;
				break;
			case MADV_CORE:
				entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
				break;
			default:
				break;
			}
			vm_map_try_merge_entries(map, prev_entry, entry);
		}
		vm_map_try_merge_entries(map, prev_entry, entry);
		vm_map_unlock(map);
	} else {
		vm_pindex_t pstart, pend;

		/*
		 * madvise behaviors that are implemented in the underlying
		 * vm_object.
		 *
		 * Since we don't clip the vm_map_entry, we have to clip
		 * the vm_object pindex and count.
		 */
		if (!vm_map_lookup_entry(map, start, &entry))
			entry = vm_map_entry_succ(entry);
		for (; entry->start < end;
		    entry = vm_map_entry_succ(entry)) {
			vm_offset_t useEnd, useStart;

			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
				continue;

			/*
			 * MADV_FREE would otherwise rewind time to
			 * the creation of the shadow object.  Because
			 * we hold the VM map read-locked, neither the
			 * entry's object nor the presence of a
			 * backing object can change.
			 */
			if (behav == MADV_FREE &&
			    entry->object.vm_object != NULL &&
			    entry->object.vm_object->backing_object != NULL)
				continue;

			/*
			 * Intersect the entry with [start, end), both as
			 * object page indices and as virtual addresses.
			 */
			pstart = OFF_TO_IDX(entry->offset);
			pend = pstart + atop(entry->end - entry->start);
			useStart = entry->start;
			useEnd = entry->end;

			if (entry->start < start) {
				pstart += atop(start - entry->start);
				useStart = start;
			}
			if (entry->end > end) {
				pend -= atop(entry->end - end);
				useEnd = end;
			}

			if (pstart >= pend)
				continue;

			/*
			 * Perform the pmap_advise() before clearing
			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
			 * concurrent pmap operation, such as pmap_remove(),
			 * could clear a reference in the pmap and set
			 * PGA_REFERENCED on the page before the pmap_advise()
			 * had completed.  Consequently, the page would appear
			 * referenced based upon an old reference that
			 * occurred before this pmap_advise() ran.
			 */
			if (behav == MADV_DONTNEED || behav == MADV_FREE)
				pmap_advise(map->pmap, useStart, useEnd,
				    behav);

			vm_object_madvise(entry->object.vm_object, pstart,
			    pend, behav);

			/*
			 * Pre-populate paging structures in the
			 * WILLNEED case.  For wired entries, the
			 * paging structures are already populated.
			 */
			if (behav == MADV_WILLNEED &&
			    entry->wired_count == 0) {
				vm_map_pmap_enter(map,
				    useStart,
				    entry->protection,
				    entry->object.vm_object,
				    pstart,
				    ptoa(pend - pstart),
				    MAP_PREFAULT_MADVISE
				);
			}
		}
		vm_map_unlock_read(map);
	}
	return (0);
}

/*
 *	vm_map_inherit:
 *
 *	Sets the inheritance of the specified address
 *	range in the target map.  Inheritance
 *	affects how the map will be shared with
 *	child maps at the time of vmspace_fork.
*/
int
vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
    vm_inherit_t new_inheritance)
{
	vm_map_entry_t entry, lentry, prev_entry, start_entry;
	int rv;

	/* Reject inheritance values this routine does not know about. */
	switch (new_inheritance) {
	case VM_INHERIT_NONE:
	case VM_INHERIT_COPY:
	case VM_INHERIT_SHARE:
	case VM_INHERIT_ZERO:
		break;
	default:
		return (KERN_INVALID_ARGUMENT);
	}
	if (start == end)
		return (KERN_SUCCESS);
	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);

	/*
	 * Clip both ends of the range up front.  prev_entry is left
	 * pointing at the predecessor of start_entry, which is what the
	 * first merge attempt in the update loop needs.
	 */
	rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
	if (rv != KERN_SUCCESS)
		goto unlock;
	if (vm_map_lookup_entry(map, end - 1, &lentry)) {
		rv = vm_map_clip_end(map, lentry, end);
		if (rv != KERN_SUCCESS)
			goto unlock;
	}

	/*
	 * VM_INHERIT_COPY is refused for any entry carrying a split
	 * boundary.  This validation pass deliberately does not advance
	 * prev_entry: doing so would leave it pointing at the last entry
	 * of the range, and the update loop below would then pair
	 * start_entry with the wrong neighbor and never try to merge it
	 * with its real predecessor.
	 */
	if (new_inheritance == VM_INHERIT_COPY) {
		for (entry = start_entry; entry->start < end;
		    entry = vm_map_entry_succ(entry)) {
			if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
			    != 0) {
				rv = KERN_INVALID_ARGUMENT;
				goto unlock;
			}
		}
	}

	/*
	 * Apply the new inheritance and simplify as we go.  Guard entries
	 * are left untouched when VM_INHERIT_ZERO is requested.
	 */
	for (entry = start_entry; entry->start < end; prev_entry = entry,
	    entry = vm_map_entry_succ(entry)) {
		KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
		    entry, (uintmax_t)entry->end, (uintmax_t)end));
		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
		    new_inheritance != VM_INHERIT_ZERO)
			entry->inheritance = new_inheritance;
		vm_map_try_merge_entries(map, prev_entry, entry);
	}
	/* Also try to merge the last entry with its successor. */
	vm_map_try_merge_entries(map, prev_entry, entry);
unlock:
	vm_map_unlock(map);
	return (rv);
}

/*
 *	vm_map_entry_in_transition:
 *
 *	Release the map lock, and sleep until the entry is no longer in
 *	transition.  Awake and acquire the map lock.  If the map changed while
 *	another held the lock, lookup a possibly-changed entry at or after the
 *	'start' position of the old entry.
*/
static vm_map_entry_t
vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
    vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
{
	vm_map_entry_t entry;
	vm_offset_t start;
	u_int last_timestamp;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
	    ("not in-tranition map entry %p", in_entry));
	/*
	 * We have not yet clipped the entry.
	 */
	start = MAX(in_start, in_entry->start);
	in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
	last_timestamp = map->timestamp;
	if (vm_map_unlock_and_wait(map, 0)) {
		/*
		 * Allow interruption of user wiring/unwiring?
		 */
	}
	vm_map_lock(map);
	/* An unchanged timestamp means in_entry can be handed back as is. */
	if (last_timestamp + 1 == map->timestamp)
		return (in_entry);

	/*
	 * Look again for the entry because the map was modified while it was
	 * unlocked.  Specifically, the entry may have been clipped, merged, or
	 * deleted.
	 */
	if (!vm_map_lookup_entry(map, start, &entry)) {
		/* Without holes_ok, report the hole via *io_end and NULL. */
		if (!holes_ok) {
			*io_end = start;
			return (NULL);
		}
		entry = vm_map_entry_succ(entry);
	}
	return (entry);
}

/*
 *	vm_map_unwire:
 *
 *	Implements both kernel and user unwiring.
 */
int
vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
    int flags)
{
	vm_map_entry_t entry, first_entry, next_entry, prev_entry;
	int rv;
	bool holes_ok, need_wakeup, user_unwire;

	if (start == end)
		return (KERN_SUCCESS);
	holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
	user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if (!vm_map_lookup_entry(map, start, &first_entry)) {
		if (holes_ok)
			first_entry = vm_map_entry_succ(first_entry);
		else {
			vm_map_unlock(map);
			return (KERN_INVALID_ADDRESS);
		}
	}
	rv = KERN_SUCCESS;

	/*
	 * First pass: clip each entry and mark it in-transition, owned
	 * by this thread.  On failure "end" is pulled back so the second
	 * pass below stops at the failing entry.
	 */
	for (entry = first_entry; entry->start < end; entry = next_entry) {
		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
			/*
			 * We have not yet clipped the entry.
			 */
			next_entry = vm_map_entry_in_transition(map, start,
			    &end, holes_ok, entry);
			if (next_entry == NULL) {
				if (entry == first_entry) {
					vm_map_unlock(map);
					return (KERN_INVALID_ADDRESS);
				}
				rv = KERN_INVALID_ADDRESS;
				break;
			}
			/*
			 * A NULL first_entry makes the second pass look
			 * the starting entry up again.
			 */
			first_entry = (entry == first_entry) ?
			    next_entry : NULL;
			continue;
		}
		rv = vm_map_clip_start(map, entry, start);
		if (rv != KERN_SUCCESS)
			break;
		rv = vm_map_clip_end(map, entry, end);
		if (rv != KERN_SUCCESS)
			break;

		/*
		 * Mark the entry in case the map lock is released.  (See
		 * above.)
		 */
		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
		    entry->wiring_thread == NULL,
		    ("owned map entry %p", entry));
		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
		entry->wiring_thread = curthread;
		next_entry = vm_map_entry_succ(entry);
		/*
		 * Check the map for holes in the specified region.
		 * If holes_ok, skip this check.
		 */
		if (!holes_ok &&
		    entry->end < end && next_entry->start > entry->end) {
			end = entry->end;
			rv = KERN_INVALID_ADDRESS;
			break;
		}
		/*
		 * If system unwiring, require that the entry is system wired.
		 */
		if (!user_unwire &&
		    vm_map_entry_system_wired_count(entry) == 0) {
			end = entry->end;
			rv = KERN_INVALID_ARGUMENT;
			break;
		}
	}
	need_wakeup = false;
	if (first_entry == NULL &&
	    !vm_map_lookup_entry(map, start, &first_entry)) {
		KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
		prev_entry = first_entry;
		entry = vm_map_entry_succ(first_entry);
	} else {
		prev_entry = vm_map_entry_pred(first_entry);
		entry = first_entry;
	}

	/*
	 * Second pass: drop the wiring (only if the first pass succeeded),
	 * clear our in-transition marks and simplify the entries.
	 */
	for (; entry->start < end;
	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
		/*
		 * If holes_ok was specified, an empty
		 * space in the unwired region could have been mapped
		 * while the map lock was dropped for draining
		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
		 * could be simultaneously wiring this new mapping
		 * entry.  Detect these cases and skip any entries
		 * marked as in transition by us.
		 */
		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
		    entry->wiring_thread != curthread) {
			KASSERT(holes_ok,
			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
			continue;
		}

		if (rv == KERN_SUCCESS && (!user_unwire ||
		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
			if (entry->wired_count == 1)
				vm_map_entry_unwire(map, entry);
			else
				entry->wired_count--;
			if (user_unwire)
				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
		}
		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
		    ("vm_map_unwire: in-transition flag missing %p", entry));
		KASSERT(entry->wiring_thread == curthread,
		    ("vm_map_unwire: alien wire %p", entry));
		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
		entry->wiring_thread = NULL;
		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
			need_wakeup = true;
		}
		vm_map_try_merge_entries(map, prev_entry, entry);
	}
	vm_map_try_merge_entries(map, prev_entry, entry);
	vm_map_unlock(map);
	if (need_wakeup)
		vm_map_wakeup(map);
	return (rv);
}

/*
 * Subtract npages from the global vm_user_wire_count.
 */
static void
vm_map_wire_user_count_sub(u_long npages)
{

	atomic_subtract_long(&vm_user_wire_count, npages);
}

/*
 * Add npages to vm_user_wire_count unless the sum would exceed
 * vm_page_max_user_wired.  Returns true if the count was updated and
 * false if the limit would be exceeded.
 */
static bool
vm_map_wire_user_count_add(u_long npages)
{
	u_long wired;

	wired = vm_user_wire_count;
	do {
		/* The limit is rechecked on every retry of the update. */
		if (npages + wired > vm_page_max_user_wired)
			return (false);
	} while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
	    npages + wired));

	return (true);
}

/*
 *	vm_map_wire_entry_failure:
 *
 *	Handle a wiring failure on the given entry.
 *
 *	The map should be locked.
 */
static void
vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
    vm_offset_t failed_addr)
{

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
	    entry->wired_count == 1,
	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
	KASSERT(failed_addr < entry->end,
	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));

	/*
	 * If any pages at the start of this entry were successfully wired,
	 * then unwire them.
	 */
	if (failed_addr > entry->start) {
		pmap_unwire(map->pmap, entry->start, failed_addr);
		vm_object_unwire(entry->object.vm_object, entry->offset,
		    failed_addr - entry->start, PQ_ACTIVE);
	}

	/*
	 * Assign an out-of-range value to represent the failure to wire this
	 * entry.
	 */
	entry->wired_count = -1;
}

/*
 * Take the map lock, call vm_map_wire_locked(), release the lock and
 * return that call's result.
 */
int
vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
{
	int rv;

	vm_map_lock(map);
	rv = vm_map_wire_locked(map, start, end, flags);
	vm_map_unlock(map);
	return (rv);
}

/*
 *	vm_map_wire_locked:
 *
 *	Implements both kernel and user wiring.  Returns with the map locked,
 *	the map lock may be dropped.
 */
int
vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
{
	vm_map_entry_t entry, first_entry, next_entry, prev_entry;
	vm_offset_t faddr, saved_end, saved_start;
	u_long incr, npages;
	u_int bidx, last_timestamp;
	int rv;
	bool holes_ok, need_wakeup, user_wire;
	vm_prot_t prot;

	VM_MAP_ASSERT_LOCKED(map);

	if (start == end)
		return (KERN_SUCCESS);
	/* prot is the access each entry must grant in order to be wired. */
	prot = 0;
	if (flags & VM_MAP_WIRE_WRITE)
		prot |= VM_PROT_WRITE;
	holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
	user_wire = (flags & VM_MAP_WIRE_USER) != 0;
	VM_MAP_RANGE_CHECK(map, start, end);
	if (!vm_map_lookup_entry(map, start, &first_entry)) {
		if (holes_ok)
			first_entry = vm_map_entry_succ(first_entry);
		else
			return (KERN_INVALID_ADDRESS);
	}
	for (entry = first_entry; entry->start < end; entry = next_entry) {
		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
			/*
			 * We have not yet clipped the entry.
			 */
			next_entry = vm_map_entry_in_transition(map, start,
			    &end, holes_ok, entry);
			if (next_entry == NULL) {
				if (entry == first_entry)
					return (KERN_INVALID_ADDRESS);
				rv = KERN_INVALID_ADDRESS;
				goto done;
			}
			first_entry = (entry == first_entry) ?
			    next_entry : NULL;
			continue;
		}
		rv = vm_map_clip_start(map, entry, start);
		if (rv != KERN_SUCCESS)
			goto done;
		rv = vm_map_clip_end(map, entry, end);
		if (rv != KERN_SUCCESS)
			goto done;

		/*
		 * Mark the entry in case the map lock is released.  (See
		 * above.)
		 */
		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
		    entry->wiring_thread == NULL,
		    ("owned map entry %p", entry));
		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
		entry->wiring_thread = curthread;
		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
		    || (entry->protection & prot) != prot) {
			/* Insufficient access: skip, or fail without holes_ok. */
			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
			if (!holes_ok) {
				end = entry->end;
				rv = KERN_INVALID_ADDRESS;
				goto done;
			}
		} else if (entry->wired_count == 0) {
			entry->wired_count++;

			npages = atop(entry->end - entry->start);
			if (user_wire && !vm_map_wire_user_count_add(npages)) {
				vm_map_wire_entry_failure(map, entry,
				    entry->start);
				end = entry->end;
				rv = KERN_RESOURCE_SHORTAGE;
				goto done;
			}

			/*
			 * Release the map lock, relying on the in-transition
			 * mark.  Mark the map busy for fork.
			 */
			saved_start = entry->start;
			saved_end = entry->end;
			last_timestamp = map->timestamp;
			/* The fault stride is the entry's boundary page size. */
			bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
			    >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
			incr =  pagesizes[bidx];
			vm_map_busy(map);
			vm_map_unlock(map);

			for (faddr = saved_start; faddr < saved_end;
			    faddr += incr) {
				/*
				 * Simulate a fault to get the page and enter
				 * it into the physical map.
				 */
				rv = vm_fault(map, faddr, VM_PROT_NONE,
				    VM_FAULT_WIRE, NULL);
				if (rv != KERN_SUCCESS)
					break;
			}
			vm_map_lock(map);
			vm_map_unbusy(map);
			if (last_timestamp + 1 != map->timestamp) {
				/*
				 * Look again for the entry because the map was
				 * modified while it was unlocked.  The entry
				 * may have been clipped, but NOT merged or
				 * deleted.
				 */
				if (!vm_map_lookup_entry(map, saved_start,
				    &next_entry))
					KASSERT(false,
					    ("vm_map_wire: lookup failed"));
				first_entry = (entry == first_entry) ?
				    next_entry : NULL;
				for (entry = next_entry; entry->end < saved_end;
				    entry = vm_map_entry_succ(entry)) {
					/*
					 * In case of failure, handle entries
					 * that were not fully wired here;
					 * fully wired entries are handled
					 * later.
					 */
					if (rv != KERN_SUCCESS &&
					    faddr < entry->end)
						vm_map_wire_entry_failure(map,
						    entry, faddr);
				}
			}
			if (rv != KERN_SUCCESS) {
				vm_map_wire_entry_failure(map, entry, faddr);
				if (user_wire)
					vm_map_wire_user_count_sub(npages);
				end = entry->end;
				goto done;
			}
		} else if (!user_wire ||
		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
			entry->wired_count++;
		}
		/*
		 * Check the map for holes in the specified region.
		 * If holes_ok was specified, skip this check.
		 */
		next_entry = vm_map_entry_succ(entry);
		if (!holes_ok &&
		    entry->end < end && next_entry->start > entry->end) {
			end = entry->end;
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
	}
	rv = KERN_SUCCESS;
done:
	need_wakeup = false;
	if (first_entry == NULL &&
	    !vm_map_lookup_entry(map, start, &first_entry)) {
		KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
		prev_entry = first_entry;
		entry = vm_map_entry_succ(first_entry);
	} else {
		prev_entry = vm_map_entry_pred(first_entry);
		entry = first_entry;
	}
	for (; entry->start < end;
	    prev_entry = entry, entry = vm_map_entry_succ(entry)) {
		/*
		 * If holes_ok was specified, an empty
		 * space in the unwired region could have been mapped
		 * while the map lock was dropped for faulting in the
		 * pages or draining MAP_ENTRY_IN_TRANSITION.
		 * Moreover, another thread could be simultaneously
		 * wiring this new mapping entry.  Detect these cases
		 * and skip any entries marked as in transition not by us.
		 *
		 * Another way to get an entry not marked with
		 * MAP_ENTRY_IN_TRANSITION is after failed clipping,
		 * which set rv to KERN_INVALID_ARGUMENT.
		 */
		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
		    entry->wiring_thread != curthread) {
			KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
			    ("vm_map_wire: !HOLESOK and new/changed entry"));
			continue;
		}

		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
			/* do nothing */
		} else if (rv == KERN_SUCCESS) {
			if (user_wire)
				entry->eflags |= MAP_ENTRY_USER_WIRED;
		} else if (entry->wired_count == -1) {
			/*
			 * Wiring failed on this entry.  Thus, unwiring is
			 * unnecessary.
			 */
			entry->wired_count = 0;
		} else if (!user_wire ||
		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
			/*
			 * Undo the wiring.  Wiring succeeded on this entry
			 * but failed on a later entry.
			 */
			if (entry->wired_count == 1) {
				vm_map_entry_unwire(map, entry);
				if (user_wire)
					vm_map_wire_user_count_sub(
					    atop(entry->end - entry->start));
			} else
				entry->wired_count--;
		}
		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
		    ("vm_map_wire: in-transition flag missing %p", entry));
		KASSERT(entry->wiring_thread == curthread,
		    ("vm_map_wire: alien wire %p", entry));
		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
		    MAP_ENTRY_WIRE_SKIPPED);
		entry->wiring_thread = NULL;
		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
			need_wakeup = true;
		}
		vm_map_try_merge_entries(map, prev_entry, entry);
	}
	vm_map_try_merge_entries(map, prev_entry, entry);
	if (need_wakeup)
		vm_map_wakeup(map);
	return (rv);
}

/*
 * vm_map_sync
 *
 * Push any dirty cached pages in the address range to their pager.
 * If syncio is TRUE, dirty pages are written synchronously.
 * If invalidate is TRUE, any cached pages are freed as well.
 *
 * If the size of the region from start to end is zero, we are
 * supposed to flush all modified pages within the region containing
 * start.  Unfortunately, a region can be split or coalesced with
 * neighboring regions, making it difficult to determine what the
 * original region was.  Therefore, we approximate this requirement by
 * flushing the current region containing start.
 *
 * Returns an error if any part of the specified range is not mapped.
*/
int
vm_map_sync(
	vm_map_t map,
	vm_offset_t start,
	vm_offset_t end,
	boolean_t syncio,
	boolean_t invalidate)
{
	vm_map_entry_t entry, first_entry, next_entry;
	vm_size_t size;
	vm_object_t object;
	vm_ooffset_t offset;
	unsigned int last_timestamp;
	int bdry_idx;
	boolean_t failed;

	vm_map_lock_read(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if (!vm_map_lookup_entry(map, start, &first_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	} else if (start == end) {
		/* Empty range: use the bounds of the entry containing start. */
		start = first_entry->start;
		end = first_entry->end;
	}

	/*
	 * Make a first pass to check for user-wired memory, holes,
	 * and partial invalidation of largepage mappings.
	 */
	for (entry = first_entry; entry->start < end; entry = next_entry) {
		if (invalidate) {
			if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
				vm_map_unlock_read(map);
				return (KERN_INVALID_ARGUMENT);
			}
			bdry_idx = (entry->eflags &
			    MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
			    MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
			if (bdry_idx != 0 &&
			    ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
			    (end & (pagesizes[bdry_idx] - 1)) != 0)) {
				vm_map_unlock_read(map);
				return (KERN_INVALID_ARGUMENT);
			}
		}
		next_entry = vm_map_entry_succ(entry);
		if (end > entry->end &&
		    entry->end != next_entry->start) {
			vm_map_unlock_read(map);
			return (KERN_INVALID_ADDRESS);
		}
	}

	if (invalidate)
		pmap_remove(map->pmap, start, end);
	failed = FALSE;

	/*
	 * Make a second pass, cleaning/uncaching pages from the indicated
	 * objects as we go.
	 */
	for (entry = first_entry; entry->start < end;) {
		offset = entry->offset + (start - entry->start);
		size = (end <= entry->end ? end : entry->end) - start;
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
			vm_map_t smap;
			vm_map_entry_t tentry;
			vm_size_t tsize;

			/*
			 * Translate through the submap and limit the size
			 * to what remains of the submap entry found there.
			 */
			smap = entry->object.sub_map;
			vm_map_lock_read(smap);
			(void) vm_map_lookup_entry(smap, offset, &tentry);
			tsize = tentry->end - offset;
			if (tsize < size)
				size = tsize;
			object = tentry->object.vm_object;
			offset = tentry->offset + (offset - tentry->start);
			vm_map_unlock_read(smap);
		} else {
			object = entry->object.vm_object;
		}
		/* Hold a reference while the map lock is dropped for the sync. */
		vm_object_reference(object);
		last_timestamp = map->timestamp;
		vm_map_unlock_read(map);
		if (!vm_object_sync(object, offset, size, syncio, invalidate))
			failed = TRUE;
		start += size;
		vm_object_deallocate(object);
		vm_map_lock_read(map);
		/*
		 * With an unchanged timestamp, step to the successor.
		 * Otherwise look up the new start, and step to the
		 * successor only if that lookup fails.
		 */
		if (last_timestamp == map->timestamp ||
		    !vm_map_lookup_entry(map, start, &entry))
			entry = vm_map_entry_succ(entry);
	}

	vm_map_unlock_read(map);
	return (failed ? KERN_FAILURE : KERN_SUCCESS);
}

/*
 *	vm_map_entry_unwire:	[ internal use only ]
 *
 *	Make the region specified by this entry pageable.
 *
 *	The map in question should be locked.
 *	[This is the reason for this routine's existence.]
 */
static void
vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
{
	vm_size_t size;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(entry->wired_count > 0,
	    ("vm_map_entry_unwire: entry %p isn't wired", entry));

	size = entry->end - entry->start;
	/* User-wired entries also give back their share of the user count. */
	if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
		vm_map_wire_user_count_sub(atop(size));
	pmap_unwire(map->pmap, entry->start, entry->end);
	vm_object_unwire(entry->object.vm_object, entry->offset, size,
	    PQ_ACTIVE);
	entry->wired_count = 0;
}

/*
 * Drop the entry's object reference (unless the entry is a submap) and
 * free the entry to kmapentzone or mapentzone, selected by system_map.
 */
static void
vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
{

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
		vm_object_deallocate(entry->object.vm_object);
	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
}

/*
 *	vm_map_entry_delete:	[ internal use only ]
 *
 *	Deallocate the given entry from the target map.
*/
static void
vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
{
	vm_object_t object;
	vm_pindex_t offidxstart, offidxend, size1;
	vm_size_t size;

	vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
	object = entry->object.vm_object;

	/* Guard entries are freed at once; map->size is not adjusted. */
	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
		MPASS(entry->cred == NULL);
		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
		MPASS(object == NULL);
		vm_map_entry_deallocate(entry, map->system_map);
		return;
	}

	size = entry->end - entry->start;
	map->size -= size;

	/* Give back the swap reservation held by the entry itself. */
	if (entry->cred != NULL) {
		swap_release_by_cred(size, entry->cred);
		crfree(entry->cred);
	}

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
		entry->object.vm_object = NULL;
	} else if ((object->flags & OBJ_ANON) != 0 ||
	    object == kernel_object) {
		KASSERT(entry->cred == NULL || object->cred == NULL ||
		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
		offidxstart = OFF_TO_IDX(entry->offset);
		offidxend = offidxstart + atop(size);
		VM_OBJECT_WLOCK(object);
		if (object->ref_count != 1 &&
		    ((object->flags & OBJ_ONEMAPPING) != 0 ||
		    object == kernel_object)) {
			vm_object_collapse(object);

			/*
			 * The option OBJPR_NOTMAPPED can be passed here
			 * because vm_map_delete() already performed
			 * pmap_remove() on the only mapping to this range
			 * of pages.
			 */
			vm_object_page_remove(object, offidxstart, offidxend,
			    OBJPR_NOTMAPPED);
			/*
			 * If the removed range reaches the end of the
			 * object, shrink the object and release the charge
			 * for the pages that were cut off.
			 */
			if (offidxend >= object->size &&
			    offidxstart < object->size) {
				size1 = object->size;
				object->size = offidxstart;
				if (object->cred != NULL) {
					size1 -= object->size;
					KASSERT(object->charge >= ptoa(size1),
					    ("object %p charge < 0", object));
					swap_release_by_cred(ptoa(size1),
					    object->cred);
					object->charge -= ptoa(size1);
				}
			}
		}
		VM_OBJECT_WUNLOCK(object);
	}
	if (map->system_map)
		vm_map_entry_deallocate(entry, TRUE);
	else {
		/* Otherwise push the entry onto curthread's td_map_def_user list. */
		entry->defer_next = curthread->td_map_def_user;
		curthread->td_map_def_user = entry;
	}
}

/*
 *	vm_map_delete:	[ internal use only ]
 *
 *	Deallocates the given address range from the target
 *	map.
*/
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	vm_map_entry_t entry, next_entry, scratch_entry;
	int rv;

	VM_MAP_ASSERT_LOCKED(map);

	if (start == end)
		return (KERN_SUCCESS);

	/*
	 * Find the start of the region, and clip it.
	 * Step through all entries in this region.
	 */
	rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
	if (rv != KERN_SUCCESS)
		return (rv);
	for (; entry->start < end; entry = next_entry) {
		/*
		 * Wait for wiring or unwiring of an entry to complete.
		 * Also wait for any system wirings to disappear on
		 * user maps.
		 */
		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
		    (vm_map_pmap(map) != kernel_pmap &&
		    vm_map_entry_system_wired_count(entry) != 0)) {
			unsigned int last_timestamp;
			vm_offset_t saved_start;

			saved_start = entry->start;
			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			last_timestamp = map->timestamp;
			(void) vm_map_unlock_and_wait(map, 0);
			vm_map_lock(map);
			if (last_timestamp + 1 != map->timestamp) {
				/*
				 * Look again for the entry because the map was
				 * modified while it was unlocked.
				 * Specifically, the entry may have been
				 * clipped, merged, or deleted.
				 */
				rv = vm_map_lookup_clip_start(map, saved_start,
				    &next_entry, &scratch_entry);
				if (rv != KERN_SUCCESS)
					break;
			} else
				next_entry = entry;
			/* Re-examine the same (or the re-looked-up) entry. */
			continue;
		}

		/* XXXKIB or delete to the upper superpage boundary ? */
		rv = vm_map_clip_end(map, entry, end);
		if (rv != KERN_SUCCESS)
			break;
		next_entry = vm_map_entry_succ(entry);

		/*
		 * Unwire before removing addresses from the pmap; otherwise,
		 * unwiring will put the entries back in the pmap.
		 */
		if (entry->wired_count != 0)
			vm_map_entry_unwire(map, entry);

		/*
		 * Remove mappings for the pages, but only if the
		 * mappings could exist.  For instance, it does not
		 * make sense to call pmap_remove() for guard entries.
		 */
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
		    entry->object.vm_object != NULL)
			pmap_remove(map->pmap, entry->start, entry->end);

		/* Pull anon_loc back if it sat at the end of this entry. */
		if (entry->end == map->anon_loc)
			map->anon_loc = entry->start;

		/*
		 * Delete the entry only after removing all pmap
		 * entries pointing to its pages.  (Otherwise, its
		 * page frames may be reallocated, and any modify bits
		 * will be set in the wrong object!)
		 */
		vm_map_entry_delete(map, entry);
	}
	return (rv);
}

/*
 *	vm_map_remove:
 *
 *	Remove the given address range from the target map.
 *	This is the exported form of vm_map_delete.
 */
int
vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int result;

	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	result = vm_map_delete(map, start, end);
	vm_map_unlock(map);
	return (result);
}

/*
 *	vm_map_check_protection:
 *
 *	Assert that the target map allows the specified privilege on the
 *	entire address region given.  The entire region must be allocated.
 *
 *	WARNING!  This code does not and should not check whether the
 *	contents of the region is accessible.  For example a smaller file
 *	might be mapped into a larger address space.
 *
 *	NOTE!  This code is also called by munmap().
 *
 *	The map must be locked.  A read lock is sufficient.
 */
boolean_t
vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
			vm_prot_t protection)
{
	vm_map_entry_t entry;
	vm_map_entry_t tmp_entry;

	/* The start address itself must be mapped. */
	if (!vm_map_lookup_entry(map, start, &tmp_entry))
		return (FALSE);
	entry = tmp_entry;

	while (start < end) {
		/*
		 * No holes allowed!
		 */
		if (start < entry->start)
			return (FALSE);
		/*
		 * Check protection associated with entry.
		 */
		if ((entry->protection & protection) != protection)
			return (FALSE);
		/* go to next entry */
		start = entry->end;
		entry = vm_map_entry_succ(entry);
	}
	return (TRUE);
}

/*
 *
 *	vm_map_copy_swap_object:
 *
 *	Copies a swap-backed object from an existing map entry to a
 *	new one.  Carries forward the swap charge.  May change the
 *	src object on return.
*/ static void vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_offset_t size, vm_ooffset_t *fork_charge) { vm_object_t src_object; struct ucred *cred; int charged; src_object = src_entry->object.vm_object; charged = ENTRY_CHARGED(src_entry); if ((src_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(src_object); vm_object_collapse(src_object); if ((src_object->flags & OBJ_ONEMAPPING) != 0) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(src_object); } else vm_object_reference(src_object); if (src_entry->cred != NULL && !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { KASSERT(src_object->cred == NULL, ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p", src_object)); src_object->cred = src_entry->cred; src_object->charge = size; } dst_entry->object.vm_object = src_object; if (charged) { cred = curthread->td_ucred; crhold(cred); dst_entry->cred = cred; *fork_charge += size; if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { crhold(cred); src_entry->cred = cred; *fork_charge += size; } } } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_ooffset_t *fork_charge) { vm_object_t src_object; vm_map_entry_t fake_entry; vm_offset_t size; VM_MAP_ASSERT_LOCKED(dst_map); if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0 || (src_entry->protection & VM_PROT_WRITE) == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. 
*/ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && (src_entry->protection & VM_PROT_WRITE) != 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ size = src_entry->end - src_entry->start; if ((src_object = src_entry->object.vm_object) != NULL) { - /* - * Swap-backed objects need special handling. Note that - * this is an unlocked check, so it is possible to race - * with an OBJT_DEFAULT -> OBJT_SWAP conversion. - */ - if (src_object->type == OBJT_DEFAULT || - src_object->type == OBJT_SWAP || - (src_object->flags & OBJ_SWAP) != 0) { + if ((src_object->flags & OBJ_SWAP) != 0) { vm_map_copy_swap_object(src_entry, dst_entry, size, fork_charge); /* May have split/collapsed, reload obj. */ src_object = src_entry->object.vm_object; } else { vm_object_reference(src_object); dst_entry->object.vm_object = src_object; } src_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->offset = src_entry->offset; if (src_entry->eflags & MAP_ENTRY_WRITECNT) { /* * MAP_ENTRY_WRITECNT cannot * indicate write reference from * src_entry, since the entry is * marked as needs copy. Allocate a * fake entry that is used to * decrement object->un_pager writecount * at the appropriate time. Attach * fake_entry to the deferred list. 
*/ fake_entry = vm_map_entry_create(dst_map); fake_entry->eflags = MAP_ENTRY_WRITECNT; src_entry->eflags &= ~MAP_ENTRY_WRITECNT; vm_object_reference(src_object); fake_entry->object.vm_object = src_object; fake_entry->start = src_entry->start; fake_entry->end = src_entry->end; fake_entry->defer_next = curthread->td_map_def_user; curthread->td_map_def_user = fake_entry; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; if (src_entry->cred != NULL) { dst_entry->cred = curthread->td_ucred; crhold(dst_entry->cred); *fork_charge += size; } } } else { /* * We don't want to make writeable wired pages copy-on-write. * Immediately copy these pages into the new map by simulating * page faults. The new pages are pageable. */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, fork_charge); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) 
*/
/*
 * vm1 is the source vmspace (read only), vm2 the new one, and entry the
 * map entry just added to vm2.  Guard entries are not counted.
 */
static void
vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
    vm_map_entry_t entry)
{
	vm_size_t entrysize;
	vm_offset_t newend;

	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
		return;
	entrysize = entry->end - entry->start;
	vm2->vm_map.size += entrysize;
	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
		/* Stack entries are accounted in vm_ssize. */
		vm2->vm_ssize += btoc(entrysize);
	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
		/* Starts inside vm1's data region: count the overlap. */
		newend = MIN(entry->end,
		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
		vm2->vm_dsize += btoc(newend - entry->start);
	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
		/* Starts inside vm1's text region: count the overlap. */
		newend = MIN(entry->end,
		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
		vm2->vm_tsize += btoc(newend - entry->start);
	}
}

/*
 * vmspace_fork:
 * Create a new process vmspace structure and vm_map
 * based on those of an existing process.  The new map
 * is based on the old map, according to the inheritance
 * values on the regions in that map.
 *
 * XXX It might be worth coalescing the entries added to the new vmspace.
 *
 * The source map must not be locked.
 *
 * Returns the new vmspace, or NULL if vmspace_alloc() or
 * pmap_vmspace_copy() fails.  *fork_charge is incremented by the amount
 * of swap the copied and zero-inherited entries account for.
 */
struct vmspace *
vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
{
	struct vmspace *vm2;
	vm_map_t new_map, old_map;
	vm_map_entry_t new_entry, old_entry;
	vm_object_t object;
	int error, locked __diagused;
	vm_inherit_t inh;

	old_map = &vm1->vm_map;
	/* Copy immutable fields of vm1 to vm2. */
	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
	    pmap_pinit);
	if (vm2 == NULL)
		return (NULL);

	vm2->vm_taddr = vm1->vm_taddr;
	vm2->vm_daddr = vm1->vm_daddr;
	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
	vm2->vm_stacktop = vm1->vm_stacktop;
	vm_map_lock(old_map);
	if (old_map->busy)
		vm_map_wait_busy(old_map);
	new_map = &vm2->vm_map;
	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
	KASSERT(locked, ("vmspace_fork: lock failed"));

	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
	if (error != 0) {
		/* Same unlock sequence as the normal exit path below. */
		sx_xunlock(&old_map->lock);
		sx_xunlock(&new_map->lock);
		vm_map_process_deferred();
		vmspace_free(vm2);
		return (NULL);
	}

	new_map->anon_loc = old_map->anon_loc;
	new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
	    MAP_ASLR_STACK | MAP_WXORX);

	VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
		if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
			panic("vm_map_fork: encountered a submap");

		/* Guard entries are copied unless they are not inherited. */
		inh = old_entry->inheritance;
		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    inh != VM_INHERIT_NONE)
			inh = VM_INHERIT_COPY;

		switch (inh) {
		case VM_INHERIT_NONE:
			break;

		case VM_INHERIT_SHARE:
			/*
			 * Clone the entry, creating the shared object if
			 * necessary.
			 */
			object = old_entry->object.vm_object;
			if (object == NULL) {
				vm_map_entry_back(old_entry);
				object = old_entry->object.vm_object;
			}

			/*
			 * Add the reference before calling vm_object_shadow
			 * to ensure that a shadow object is created.
			 */
			vm_object_reference(object);
			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
				vm_object_shadow(&old_entry->object.vm_object,
				    &old_entry->offset,
				    old_entry->end - old_entry->start,
				    old_entry->cred,
				    /* Transfer the second reference too. */
				    true);
				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
				old_entry->cred = NULL;

				/*
				 * As in vm_map_merged_neighbor_dispose(),
				 * the vnode lock will not be acquired in
				 * this call to vm_object_deallocate().
				 */
				vm_object_deallocate(object);
				object = old_entry->object.vm_object;
			} else {
				VM_OBJECT_WLOCK(object);
				vm_object_clear_flag(object, OBJ_ONEMAPPING);
				/* Move the entry's charge to the object. */
				if (old_entry->cred != NULL) {
					KASSERT(object->cred == NULL,
					    ("vmspace_fork both cred"));
					object->cred = old_entry->cred;
					object->charge = old_entry->end -
					    old_entry->start;
					old_entry->cred = NULL;
				}

				/*
				 * Assert the correct state of the vnode
				 * v_writecount while the object is locked, to
				 * not relock it later for the assertion
				 * correctness.
				 */
				if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
				    object->type == OBJT_VNODE) {
					KASSERT(((struct vnode *)object->
					    handle)->v_writecount > 0,
					    ("vmspace_fork: v_writecount %p",
					    object));
					KASSERT(object->un_pager.vnp.
					    writemappings > 0,
					    ("vmspace_fork: vnp.writecount %p",
					    object));
				}
				VM_OBJECT_WUNLOCK(object);
			}

			/*
			 * Clone the entry, referencing the shared object.
			 */
			new_entry = vm_map_entry_create(new_map);
			*new_entry = *old_entry;
			/* Wiring state is not carried over to the clone. */
			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
			    MAP_ENTRY_IN_TRANSITION);
			new_entry->wiring_thread = NULL;
			new_entry->wired_count = 0;
			if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
				vm_pager_update_writecount(object,
				    new_entry->start, new_entry->end);
			}
			vm_map_entry_set_vnode_text(new_entry, true);

			/*
			 * Insert the entry into the new map -- we know we're
			 * inserting at the end of the new map.
			 */
			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);

			/*
			 * Update the physical map
			 */
			pmap_copy(new_map->pmap, old_map->pmap,
			    new_entry->start,
			    (old_entry->end - old_entry->start),
			    old_entry->start);
			break;

		case VM_INHERIT_COPY:
			/*
			 * Clone the entry and link into the map.
			 */
			new_entry = vm_map_entry_create(new_map);
			*new_entry = *old_entry;
			/*
			 * Copied entry is COW over the old object.
			 */
			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
			new_entry->wiring_thread = NULL;
			new_entry->wired_count = 0;
			/* Object and cred are filled in by vm_map_copy_entry. */
			new_entry->object.vm_object = NULL;
			new_entry->cred = NULL;
			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);
			vm_map_copy_entry(old_map, new_map, old_entry,
			    new_entry, fork_charge);
			vm_map_entry_set_vnode_text(new_entry, true);
			break;

		case VM_INHERIT_ZERO:
			/*
			 * Create a new anonymous mapping entry modelled from
			 * the old one.
			 */
			new_entry = vm_map_entry_create(new_map);
			memset(new_entry, 0, sizeof(*new_entry));

			new_entry->start = old_entry->start;
			new_entry->end = old_entry->end;
			new_entry->eflags = old_entry->eflags &
			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
			    MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
			    MAP_ENTRY_SPLIT_BOUNDARY_MASK);
			new_entry->protection = old_entry->protection;
			new_entry->max_protection = old_entry->max_protection;
			new_entry->inheritance = VM_INHERIT_ZERO;

			vm_map_entry_link(new_map, new_entry);
			vmspace_map_entry_forked(vm1, vm2, new_entry);

			/* The whole range is charged to the forking thread. */
			new_entry->cred = curthread->td_ucred;
			crhold(new_entry->cred);
			*fork_charge += (new_entry->end - new_entry->start);

			break;
		}
	}
	/*
	 * Use inlined vm_map_unlock() to postpone handling the deferred
	 * map entries, which cannot be done until both old_map and
	 * new_map locks are released.
	 */
	sx_xunlock(&old_map->lock);
	sx_xunlock(&new_map->lock);
	vm_map_process_deferred();

	return (vm2);
}

/*
 * Create a process's stack for exec_new_vmspace().  This function is never
 * asked to wire the newly created stack.
 *
 * Locks the map, rejects the request with KERN_NO_SPACE if the initial
 * stack size would exceed RLIMIT_VMEM, and otherwise returns the result of
 * vm_map_stack_locked().
 */
int
vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_size_t growsize, init_ssize;
	rlim_t vmemlim;
	int rv;

	MPASS((map->flags & MAP_WIREFUTURE) == 0);
	growsize = sgrowsiz;
	/* The initial size is the smaller of max_ssize and sgrowsiz. */
	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
	vm_map_lock(map);
	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
	/* If we would blow our VMEM resource limit, no go */
	if (map->size + init_ssize > vmemlim) {
		rv = KERN_NO_SPACE;
		goto out;
	}
	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
	    max, cow);
out:
	vm_map_unlock(map);
	return (rv);
}

/* Number of guard pages; multiplied by PAGE_SIZE in vm_map_stack_locked(). */
static int stack_guard_page = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
    &stack_guard_page, 0,
    "Specifies the number of guard pages for a stack that grows");

/*
 * Maps the initial part of a stack inside [addrbos, addrbos + max_ssize)
 * and covers the remainder of that range with a guard ("gap") entry.
 * Called from vm_map_stack() with the map locked.
 */
static int
vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
    vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t new_entry, prev_entry;
	vm_offset_t bot, gap_bot, gap_top, top;
	vm_size_t init_ssize, sgp;
	int orient, rv;

	/*
	 * The stack orientation is piggybacked with the cow argument.
	 * Extract it into orient.  NOTE(review): cow itself is not masked
	 * here; it is passed unchanged to vm_map_insert() below.
	 */
	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
	KASSERT(orient != 0, ("No stack grow direction"));
	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
	    ("bi-dir stack"));

	if (max_ssize == 0 ||
	    !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
		return (KERN_INVALID_ADDRESS);
	/* Guard size in bytes; zero when the process disabled the gap. */
	sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
	    (vm_size_t)stack_guard_page * PAGE_SIZE;
	if (sgp >= max_ssize)
		return (KERN_INVALID_ARGUMENT);

	/* Leave room for the guard within max_ssize. */
	init_ssize = growsize;
	if (max_ssize < init_ssize + sgp)
		init_ssize = max_ssize - sgp;

	/* If addr is already mapped, no go */
	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
		return (KERN_NO_SPACE);

	/*
	 * If we can't accommodate max_ssize in the current mapping, no go.
	 */
	if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
		return (KERN_NO_SPACE);

	/*
	 * We initially map a stack of only init_ssize.  We will grow as
	 * needed later.  Depending on the orientation of the stack (i.e.
	 * the grow direction) we either map at the top of the range, the
	 * bottom of the range or in the middle.
	 *
	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
	 * and cow to be 0.  Possibly we should eliminate these as input
	 * parameters, and just pass these values here in the insert call.
	 */
	if (orient == MAP_STACK_GROWS_DOWN) {
		bot = addrbos + max_ssize - init_ssize;
		top = bot + init_ssize;
		gap_bot = addrbos;
		gap_top = bot;
	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
		bot = addrbos;
		top = bot + init_ssize;
		gap_bot = top;
		gap_top = addrbos + max_ssize;
	}
	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
	if (rv != KERN_SUCCESS)
		return (rv);
	new_entry = vm_map_entry_succ(prev_entry);
	KASSERT(new_entry->end == top || new_entry->start == bot,
	    ("Bad entry start/end for new stack entry"));
	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
	/* The stack already fills the whole range: no gap entry needed. */
	if (gap_bot == gap_top)
		return (KERN_SUCCESS);
	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
	if (rv == KERN_SUCCESS) {
		/*
		 * Gap can never successfully handle a fault, so
		 * read-ahead logic is never used for it.  Re-use
		 * next_read of the gap entry to store
		 * stack_guard_page for vm_map_growstack().
		 */
		if (orient == MAP_STACK_GROWS_DOWN)
			vm_map_entry_pred(new_entry)->next_read = sgp;
		else
			vm_map_entry_succ(new_entry)->next_read = sgp;
	} else {
		/* Undo the stack mapping inserted above. */
		(void)vm_map_delete(map, bot, top);
	}
	return (rv);
}

/*
 * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
 * successfully grow the stack.
*/
/*
 * map is the faulting map, addr the faulting address, and gap_entry the
 * entry containing addr, or NULL to have it looked up here.  KERN_SUCCESS
 * is also returned when the entry at addr is not a guard entry, i.e. there
 * is nothing to grow.
 *
 * NOTE(review): the map lock is upgraded before the map is modified and
 * downgraded again afterwards, so the caller presumably holds the read
 * lock on entry and still holds it on return -- confirm against callers.
 */
static int
vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
{
	vm_map_entry_t stack_entry;
	struct proc *p;
	struct vmspace *vm;
	struct ucred *cred;
	vm_offset_t gap_end, gap_start, grow_start;
	vm_size_t grow_amount, guard, max_grow;
	rlim_t lmemlim, stacklim, vmemlim;
	int rv, rv1 __diagused;
	bool gap_deleted, grow_down, is_procstack;
#ifdef notyet
	uint64_t limit;
#endif
#ifdef RACCT
	int error __diagused;
#endif

	p = curproc;
	vm = p->p_vmspace;

	/*
	 * Disallow stack growth when the access is performed by a
	 * debugger or AIO daemon.  The reason is that the wrong
	 * resource limits are applied.
	 */
	if (p != initproc && (map != &p->p_vmspace->vm_map ||
	    p->p_textvp == NULL))
		return (KERN_FAILURE);

	MPASS(!map->system_map);

	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
	stacklim = lim_cur(curthread, RLIMIT_STACK);
	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
retry:
	/* If addr is not in a hole for a stack grow area, no need to grow. */
	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
		return (KERN_FAILURE);
	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
		return (KERN_SUCCESS);
	/*
	 * Find the stack entry adjacent to the gap and compute how far it
	 * must grow to cover addr.
	 */
	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
		stack_entry = vm_map_entry_succ(gap_entry);
		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
		    stack_entry->start != gap_entry->end)
			return (KERN_FAILURE);
		grow_amount = round_page(stack_entry->start - addr);
		grow_down = true;
	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
		stack_entry = vm_map_entry_pred(gap_entry);
		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
		    stack_entry->end != gap_entry->start)
			return (KERN_FAILURE);
		grow_amount = round_page(addr + 1 - stack_entry->end);
		grow_down = false;
	} else {
		return (KERN_FAILURE);
	}
	/* Guard size was stashed in next_read by vm_map_stack_locked(). */
	guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
	    (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
	    gap_entry->next_read;
	max_grow = gap_entry->end - gap_entry->start;
	if (guard > max_grow)
		return (KERN_NO_SPACE);
	max_grow -= guard;
	if (grow_amount > max_grow)
		return (KERN_NO_SPACE);

	/*
	 * If this is the main process stack, see if we're over the stack
	 * limit.
	 */
	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
	    addr < (vm_offset_t)vm->vm_stacktop;
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
		return (KERN_NO_SPACE);

#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(p);
		if (is_procstack && racct_set(p, RACCT_STACK,
		    ctob(vm->vm_ssize) + grow_amount)) {
			PROC_UNLOCK(p);
			return (KERN_NO_SPACE);
		}
		PROC_UNLOCK(p);
	}
#endif

	/*
	 * Round the request up to a multiple of sgrowsiz, then clamp it to
	 * the space left in the gap and to the stack limit.
	 */
	grow_amount = roundup(grow_amount, sgrowsiz);
	if (grow_amount > max_grow)
		grow_amount = max_grow;
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
		grow_amount = trunc_page((vm_size_t)stacklim) -
		    ctob(vm->vm_ssize);
	}

#ifdef notyet
	PROC_LOCK(p);
	limit = racct_get_available(p, RACCT_STACK);
	PROC_UNLOCK(p);
	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
		grow_amount = limit - ctob(vm->vm_ssize);
#endif

	/* The grown range will be wired below; check RLIMIT_MEMLOCK now. */
	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
			rv = KERN_NO_SPACE;
			goto out;
		}
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(p);
			if (racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
				PROC_UNLOCK(p);
				rv = KERN_NO_SPACE;
				goto out;
			}
			PROC_UNLOCK(p);
		}
#endif
	}

	/* If we would blow our VMEM resource limit, no go */
	if (map->size + grow_amount > vmemlim) {
		rv = KERN_NO_SPACE;
		goto out;
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(p);
		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
			PROC_UNLOCK(p);
			rv = KERN_NO_SPACE;
			goto out;
		}
		PROC_UNLOCK(p);
	}
#endif

	/*
	 * If the upgrade fails, re-take the read lock and start over with
	 * a fresh lookup, since gap_entry can no longer be trusted.
	 */
	if (vm_map_lock_upgrade(map)) {
		gap_entry = NULL;
		vm_map_lock_read(map);
		goto retry;
	}

	if (grow_down) {
		grow_start = gap_entry->end - grow_amount;
		if (gap_entry->start + grow_amount == gap_entry->end) {
			/* The gap is consumed entirely; remember its range. */
			gap_start = gap_entry->start;
			gap_end = gap_entry->end;
			vm_map_entry_delete(map, gap_entry);
			gap_deleted = true;
		} else {
			MPASS(gap_entry->start < gap_entry->end - grow_amount);
			vm_map_entry_resize(map, gap_entry, -grow_amount);
			gap_deleted = false;
		}
		rv = vm_map_insert(map, NULL, 0, grow_start,
		    grow_start + grow_amount,
		    stack_entry->protection, stack_entry->max_protection,
		    MAP_STACK_GROWS_DOWN);
		if (rv != KERN_SUCCESS) {
			/* Insertion failed: put the gap back as it was. */
			if (gap_deleted) {
				rv1 = vm_map_insert(map, NULL, 0, gap_start,
				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
				MPASS(rv1 == KERN_SUCCESS);
			} else
				vm_map_entry_resize(map, gap_entry,
				    grow_amount);
		}
	} else {
		grow_start = stack_entry->end;
		/* Charge the entry's cred, or else the object's cred. */
		cred = stack_entry->cred;
		if (cred == NULL && stack_entry->object.vm_object != NULL)
			cred = stack_entry->object.vm_object->cred;
		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
			rv = KERN_NO_SPACE;
		/* Grow the underlying object if applicable. */
		else if (stack_entry->object.vm_object == NULL ||
		    vm_object_coalesce(stack_entry->object.vm_object,
		    stack_entry->offset,
		    (vm_size_t)(stack_entry->end - stack_entry->start),
		    grow_amount, cred != NULL)) {
			if (gap_entry->start + grow_amount == gap_entry->end) {
				vm_map_entry_delete(map, gap_entry);
				vm_map_entry_resize(map, stack_entry,
				    grow_amount);
			} else {
				/* Shift the boundary between stack and gap. */
				gap_entry->start += grow_amount;
				stack_entry->end += grow_amount;
			}
			map->size += grow_amount;
			rv = KERN_SUCCESS;
		} else
			rv = KERN_FAILURE;
	}
	if (rv == KERN_SUCCESS && is_procstack)
		vm->vm_ssize += btoc(grow_amount);

	/*
	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
	 */
	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
		rv = vm_map_wire_locked(map, grow_start,
		    grow_start + grow_amount,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	}
	vm_map_lock_downgrade(map);

out:
#ifdef RACCT
	/* On failure, reset the racct values raised by the checks above. */
	if (racct_enable && rv != KERN_SUCCESS) {
		PROC_LOCK(p);
		error = racct_set(p, RACCT_VMEM, map->size);
		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
		if (!old_mlock) {
			error = racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)));
			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
		}
		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
		PROC_UNLOCK(p);
	}
#endif

	return (rv);
}

/*
 * Unshare the specified VM space for exec.  If other processes are
 * mapped to it, then create a new one.  The new vmspace is null.
 *
 * Returns 0 on success or ENOMEM if the new vmspace cannot be allocated.
 * The old vmspace is not freed here; TDP_EXECVMSPC is set on the current
 * thread instead.
 */
int
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;

	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
	    ("vmspace_exec recursed"));
	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
	if (newvmspace == NULL)
		return (ENOMEM);
	newvmspace->vm_swrss = oldvmspace->vm_swrss;
	/*
	 * This code is written like this for prototype purposes.  The
	 * goal is to avoid running down the vmspace here, but let the
	 * other processes that are still using the vmspace finally
	 * run it down.  Even though there is little or no chance of blocking
	 * here, it is a good idea to keep this form for future mods.
	 */
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	curthread->td_pflags |= TDP_EXECVMSPC;
	return (0);
}

/*
 * Unshare the specified VM space for forcing COW.  This
 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
*/
/*
 * Returns 0 when the vmspace is not shared or was replaced by a private
 * copy, and ENOMEM when the fork or the swap reservation for it fails.
 */
int
vmspace_unshare(struct proc *p)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;
	vm_ooffset_t fork_charge;

	/*
	 * The caller is responsible for ensuring that the reference count
	 * cannot concurrently transition 1 -> 2.
	 */
	if (refcount_load(&oldvmspace->vm_refcnt) == 1)
		return (0);
	fork_charge = 0;
	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
	if (newvmspace == NULL)
		return (ENOMEM);
	/* Reserve the swap that vmspace_fork() accounted for. */
	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
		vmspace_free(newvmspace);
		return (ENOMEM);
	}
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	/* Drop this process's reference on the formerly shared vmspace. */
	vmspace_free(oldvmspace);
	return (0);
}

/*
 *	vm_map_lookup:
 *
 *	Finds the VM object, offset, and
 *	protection for a given virtual address in the
 *	specified map, assuming a page fault of the
 *	type specified.
 *
 *	Leaves the map in question locked for read; return
 *	values are guaranteed until a vm_map_lookup_done
 *	call is performed.  Note that the map argument
 *	is in/out; the returned map must be used in
 *	the call to vm_map_lookup_done.
 *
 *	A handle (out_entry) is returned for use in
 *	vm_map_lookup_done, to make that fast.
 *
 *	If a lookup is requested with "write protection"
 *	specified, the map may be changed to perform virtual
 *	copying operations, although the data referenced will
 *	remain the same.
 *
 *	On any return other than KERN_SUCCESS the map has been
 *	unlocked again.
 */
int
vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
	      vm_offset_t vaddr,
	      vm_prot_t fault_typea,
	      vm_map_entry_t *out_entry,	/* OUT */
	      vm_object_t *object,		/* OUT */
	      vm_pindex_t *pindex,		/* OUT */
	      vm_prot_t *out_prot,		/* OUT */
	      boolean_t *wired)			/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type;
	vm_object_t eobject;
	vm_size_t size;
	struct ucred *cred;

RetryLookup:

	vm_map_lock_read(map);

RetryLookupLocked:
	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	}

	entry = *out_entry;

	/*
	 * Handle submaps.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
		vm_map_t old_map = map;

		/* Descend: the submap becomes the map reported to the caller. */
		*var_map = map = entry->object.sub_map;
		vm_map_unlock_read(old_map);
		goto RetryLookup;
	}

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
		/*
		 * A fault on a stack gap entry may be satisfied by growing
		 * the adjacent stack; if that works, look the address up
		 * again with the map still locked.
		 */
		if (prot == VM_PROT_NONE && map != kernel_map &&
		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
			goto RetryLookupLocked;
	}
	fault_type = fault_typea & VM_PROT_ALL;
	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}
	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
	    ("entry %p flags %x", entry, entry->eflags));
	/* A forced copy is refused on a non-COW entry that can never be written. */
	if ((fault_typea & VM_PROT_COPY) != 0 &&
	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
	    (entry->eflags & MAP_ENTRY_COW) == 0) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;
	size = entry->end - entry->start;

	/*
	 * If the entry was copy-on-write, we either ...
	 */
	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * If we want to write the page, we may as well handle that
		 * now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just demote the
		 * permissions allowed.
		 */
		if ((fault_type & VM_PROT_WRITE) != 0 ||
		    (fault_typea & VM_PROT_COPY) != 0) {
			/*
			 * Make a new object, and place it in the object
			 * chain.  Note that no new references have appeared
			 * -- one just moved from the map to the new
			 * object.
			 */
			if (vm_map_lock_upgrade(map))
				goto RetryLookup;

			if (entry->cred == NULL) {
				/*
				 * The debugger owner is charged for
				 * the memory.
				 */
				cred = curthread->td_ucred;
				crhold(cred);
				if (!swap_reserve_by_cred(size, cred)) {
					crfree(cred);
					vm_map_unlock(map);
					return (KERN_RESOURCE_SHORTAGE);
				}
				entry->cred = cred;
			}
			/* Remember the old object to detect a no-op shadow. */
			eobject = entry->object.vm_object;
			vm_object_shadow(&entry->object.vm_object,
			    &entry->offset, size, entry->cred, false);
			if (eobject == entry->object.vm_object) {
				/*
				 * The object was not shadowed.
				 */
				swap_release_by_cred(size, entry->cred);
				crfree(entry->cred);
			}
			entry->cred = NULL;
			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;

			vm_map_lock_downgrade(map);
		} else {
			/*
			 * We're attempting to read a copy-on-write page --
			 * don't allow writes.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * Create an object if necessary.
	 */
	if (entry->object.vm_object == NULL && !map->system_map) {
		if (vm_map_lock_upgrade(map))
			goto RetryLookup;
		/* The entry's cred is handed to the new anonymous object. */
		entry->object.vm_object = vm_object_allocate_anon(atop(size),
		    NULL, entry->cred, size);
		entry->offset = 0;
		entry->cred = NULL;
		vm_map_lock_downgrade(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}

/*
 *	vm_map_lookup_locked:
 *
 *	Lookup the faulting address.  A version of vm_map_lookup that returns
 *	KERN_FAILURE instead of blocking on map lock or memory allocation.
 *
 *	No map lock is taken or released here, and the entry is never
 *	modified.
 */
int
vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
		     vm_offset_t vaddr,
		     vm_prot_t fault_typea,
		     vm_map_entry_t *out_entry,	/* OUT */
		     vm_object_t *object,	/* OUT */
		     vm_pindex_t *pindex,	/* OUT */
		     vm_prot_t *out_prot,	/* OUT */
		     boolean_t *wired)		/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;

	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry))
		return (KERN_INVALID_ADDRESS);

	entry = *out_entry;

	/*
	 * Fail if the entry refers to a submap.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
		return (KERN_FAILURE);

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
	if ((fault_type & prot) != fault_type)
		return (KERN_PROTECTION_FAILURE);

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;

	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * Fail if the entry was copy-on-write for a write fault.
		 */
		if (fault_type & VM_PROT_WRITE)
			return (KERN_FAILURE);
		/*
		 * We're attempting to read a copy-on-write page --
		 * don't allow writes.
		 */
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Fail if an object should be created.
	 */
	if (entry->object.vm_object == NULL && !map->system_map)
		return (KERN_FAILURE);

	/*
	 * Return the object/offset from this entry.  Unlike
	 * vm_map_lookup(), nothing was fixed up: the cases that would
	 * need it have already failed above.
	 */
	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}

/*
 *	vm_map_lookup_done:
 *
 *	Releases locks acquired by a vm_map_lookup
 *	(according to the handle returned by that lookup).
*/ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } vm_offset_t vm_map_max_KBI(const struct vm_map *map) { return (vm_map_max(map)); } vm_offset_t vm_map_min_KBI(const struct vm_map *map) { return (vm_map_min(map)); } pmap_t vm_map_pmap_KBI(vm_map_t map) { return (map->pmap); } bool vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end) { return (vm_map_range_valid(map, start, end)); } #ifdef INVARIANTS static void _vm_map_assert_consistent(vm_map_t map, int check) { vm_map_entry_t entry, prev; vm_map_entry_t cur, header, lbound, ubound; vm_size_t max_left, max_right; #ifdef DIAGNOSTIC ++map->nupdates; #endif if (enable_vmmap_check != check) return; header = prev = &map->header; VM_MAP_ENTRY_FOREACH(entry, map) { KASSERT(prev->end <= entry->start, ("map %p prev->end = %jx, start = %jx", map, (uintmax_t)prev->end, (uintmax_t)entry->start)); KASSERT(entry->start < entry->end, ("map %p start = %jx, end = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->end)); KASSERT(entry->left == header || entry->left->start < entry->start, ("map %p left->start = %jx, start = %jx", map, (uintmax_t)entry->left->start, (uintmax_t)entry->start)); KASSERT(entry->right == header || entry->start < entry->right->start, ("map %p start = %jx, right->start = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->right->start)); cur = map->root; lbound = ubound = header; for (;;) { if (entry->start < cur->start) { ubound = cur; cur = cur->left; KASSERT(cur != lbound, ("map %p cannot find %jx", map, (uintmax_t)entry->start)); } else if (cur->end <= entry->start) { lbound = cur; cur = cur->right; KASSERT(cur != ubound, ("map %p cannot find %jx", map, (uintmax_t)entry->start)); } else { KASSERT(cur == entry, ("map %p cannot find %jx", map, (uintmax_t)entry->start)); break; } } max_left = vm_map_entry_max_free_left(entry, lbound); max_right = vm_map_entry_max_free_right(entry, ubound); 
KASSERT(entry->max_free == vm_size_max(max_left, max_right), ("map %p max = %jx, max_left = %jx, max_right = %jx", map, (uintmax_t)entry->max_free, (uintmax_t)max_left, (uintmax_t)max_right)); prev = entry; } KASSERT(prev->end <= entry->start, ("map %p prev->end = %jx, start = %jx", map, (uintmax_t)prev->end, (uintmax_t)entry->start)); } #endif #include "opt_ddb.h" #ifdef DDB #include #include static void vm_map_print(vm_map_t map) { vm_map_entry_t entry, prev; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); db_indent += 2; prev = &map->header; VM_MAP_ENTRY_FOREACH(entry, map) { db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n", (void *)entry, (void *)entry->start, (void *)entry->end, entry->eflags); { static const char * const inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char) entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); if (prev == &map->header || prev->object.sub_map != entry->object.sub_map) { db_indent += 2; vm_map_print((vm_map_t)entry->object.sub_map); db_indent -= 2; } } else { if (entry->cred != NULL) db_printf(", ruid %d", entry->cred->cr_ruid); db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->object.vm_object && entry->object.vm_object->cred) db_printf(", obj ruid %d charge %jx", entry->object.vm_object->cred->cr_ruid, (uintmax_t)entry->object.vm_object->charge); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? 
"needed" : "done"); db_printf("\n"); if (prev == &map->header || prev->object.vm_object != entry->object.vm_object) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, 0, 0, (char *)0); db_indent -= 2; } } prev = entry; } db_indent -= 2; } DB_SHOW_COMMAND(map, map) { if (!have_addr) { db_printf("usage: show map \n"); return; } vm_map_print((vm_map_t)addr); } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = db_lookup_proc(addr); } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((vm_map_t)&p->p_vmspace->vm_map); } #endif /* DDB */ diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 1c02ba35fcfe..56345fcaf560 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1,1682 +1,1680 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif int old_mlock = 0; SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); static int mincore_mapped = 1; SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0, "mincore reports mappings, not residency"); static int imply_prot_max = 0; SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0, "Imply maximum page protections in mmap() when none are specified"); #ifdef MAP_32BIT #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #endif 
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif
/*
 * sbrk system call entry point.  The call is not implemented: neither
 * argument is examined and EOPNOTSUPP is always returned.
 */
int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{

	(void)td;
	(void)uap;

	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif
/*
 * sstk system call entry point.  The call is not implemented: neither
 * argument is examined and EOPNOTSUPP is always returned.
 */
int
sys_sstk(struct thread *td, struct sstk_args *uap)
{

	(void)td;
	(void)uap;

	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifdef COMPAT_43
/*
 * Old getpagesize entry point: PAGE_SIZE is stored as the thread's first
 * return value and the call itself always succeeds.
 */
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */
*/ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int sys_mmap(struct thread *td, struct mmap_args *uap) { return (kern_mmap(td, &(struct mmap_req){ .mr_hint = (uintptr_t)uap->addr, .mr_len = uap->len, .mr_prot = uap->prot, .mr_flags = uap->flags, .mr_fd = uap->fd, .mr_pos = uap->pos, })); } int kern_mmap_maxprot(struct proc *p, int prot) { if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 || (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0) return (_PROT_ALL); if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) && prot != PROT_NONE) return (prot); return (_PROT_ALL); } int kern_mmap(struct thread *td, const struct mmap_req *mrp) { struct vmspace *vms; struct file *fp; struct proc *p; off_t pos; vm_offset_t addr, orig_addr; vm_size_t len, pageoff, size; vm_prot_t cap_maxprot; int align, error, fd, flags, max_prot, prot; cap_rights_t rights; mmap_check_fp_fn check_fp_fn; orig_addr = addr = mrp->mr_hint; len = mrp->mr_len; prot = mrp->mr_prot; flags = mrp->mr_flags; fd = mrp->mr_fd; pos = mrp->mr_pos; check_fp_fn = mrp->mr_check_fp_fn; if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) return (EINVAL); max_prot = PROT_MAX_EXTRACT(prot); prot = PROT_EXTRACT(prot); if (max_prot != 0 && (max_prot & prot) != prot) return (ENOTSUP); p = td->td_proc; /* * Always honor PROT_MAX if set. If not, default to all * permissions unless we're implying maximum permissions. */ if (max_prot == 0) max_prot = kern_mmap_maxprot(p, prot); vms = p->p_vmspace; fp = NULL; AUDIT_ARG_FD(fd); /* * Ignore old flags that used to be defined but did not do anything. */ flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); /* * Enforce the constraints. * Mapping of length 0 is only allowed for old binaries. * Anonymous mapping shall specify -1 as filedescriptor and * zero position for new code. 
Be nice to ancient a.out * binaries and correct pos for anonymous mapping, since old * ld.so sometimes issues anonymous map requests with non-zero * pos. */ if (!SV_CURPROC_FLAG(SV_AOUT)) { if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) || ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) return (EINVAL); } else { if ((flags & MAP_ANON) != 0) pos = 0; } if (flags & MAP_STACK) { if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | MAP_PREFAULT_READ | MAP_GUARD | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) return (EINVAL); if (prot != PROT_NONE && (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) return (EINVAL); if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 || pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0)) return (EINVAL); /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Compute size from len by rounding (on both ends). */ size = len + pageoff; /* low end... */ size = round_page(size); /* hi end */ /* Check for rounding up to zero. */ if (len > size) return (ENOMEM); /* Ensure alignment is at least a page and fits in a pointer. */ align = flags & MAP_ALIGNMENT_MASK; if (align != 0 && align != MAP_ALIGNED_SUPER && (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). 
*/ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (!vm_map_range_valid(&vms->vm_map, addr, addr + size)) return (EINVAL); #ifdef MAP_32BIT if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) return (EINVAL); } else if (flags & MAP_32BIT) { /* * For MAP_32BIT, override the hint if it is too high and * do not bother moving the mapping past the heap (since * the heap is usually above 2GB). */ if (addr + size > MAP_32BIT_MAX_ADDR) addr = 0; #endif } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)); } if (len == 0) { /* * Return success without mapping anything for old * binaries that request a page-aligned mapping of * length 0. For modern binaries, this function * returns an error earlier. */ error = 0; } else if ((flags & MAP_GUARD) != 0) { error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE, VM_PROT_NONE, flags, NULL, pos, FALSE, td); } else if ((flags & MAP_ANON) != 0) { /* * Mapping blank space is trivial. * * This relies on VM_PROT_* matching PROT_*. */ error = vm_mmap_object(&vms->vm_map, &addr, size, prot, max_prot, flags, NULL, pos, FALSE, td); } else { /* * Mapping file, get fp for validation and don't let the * descriptor disappear on us if we block. Check capability * rights, but also return the maximum rights to be combined * with maxprot later. 
*/ cap_rights_init_one(&rights, CAP_MMAP); if (prot & PROT_READ) cap_rights_set_one(&rights, CAP_MMAP_R); if ((flags & MAP_SHARED) != 0) { if (prot & PROT_WRITE) cap_rights_set_one(&rights, CAP_MMAP_W); } if (prot & PROT_EXEC) cap_rights_set_one(&rights, CAP_MMAP_X); error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); if (error != 0) goto done; if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && p->p_osrel >= P_OSREL_MAP_FSTRICT) { error = EINVAL; goto done; } if (check_fp_fn != NULL) { error = check_fp_fn(fp, prot, max_prot & cap_maxprot, flags); if (error != 0) goto done; } if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data)) addr = orig_addr; /* This relies on VM_PROT_* matching PROT_*. */ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, max_prot & cap_maxprot, flags, pos, td); } if (error == 0) td->td_retval[0] = addr + pageoff; done: if (fp) fdrop(fp, td); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) { return (kern_mmap(td, &(struct mmap_req){ .mr_hint = (uintptr_t)uap->addr, .mr_len = uap->len, .mr_prot = uap->prot, .mr_flags = uap->flags, .mr_fd = uap->fd, .mr_pos = uap->pos, })); } #endif #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(struct thread *td, struct ommap_args *uap) { return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot, uap->flags, uap->fd, uap->pos)); } int kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot, int oflags, int fd, long pos) { static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; int flags, prot; if (len < 0) return (EINVAL); #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 prot = cvtbsdprot[oprot & 0x7]; #if (defined(COMPAT_FREEBSD32) && 
defined(__amd64__)) || defined(__i386__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && prot != 0) prot |= PROT_EXEC; #endif flags = 0; if (oflags & OMAP_ANON) flags |= MAP_ANON; if (oflags & OMAP_COPY) flags |= MAP_COPY; if (oflags & OMAP_SHARED) flags |= MAP_SHARED; else flags |= MAP_PRIVATE; if (oflags & OMAP_FIXED) flags |= MAP_FIXED; return (kern_mmap(td, &(struct mmap_req){ .mr_hint = hint, .mr_len = len, .mr_prot = prot, .mr_flags = flags, .mr_fd = fd, .mr_pos = pos, })); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; size_t len; int flags; }; #endif int sys_msync(struct thread *td, struct msync_args *uap) { return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags)); } int kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) { vm_offset_t addr; vm_size_t pageoff; vm_map_t map; int rv; addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. 
*/ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (ENOMEM); case KERN_INVALID_ARGUMENT: return (EBUSY); case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif int sys_munmap(struct thread *td, struct munmap_args *uap) { return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); } int kern_munmap(struct thread *td, uintptr_t addr0, size_t size) { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; bool pmc_handled; #endif vm_offset_t addr, end; vm_size_t pageoff; vm_map_t map; int rv; if (size == 0) return (EINVAL); addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); end = addr + size; map = &td->td_proc->p_vmspace->vm_map; if (!vm_map_range_valid(map, addr, end)) return (EINVAL); vm_map_lock(map); #ifdef HWPMC_HOOKS pmc_handled = false; if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { pmc_handled = true; /* * Inform hwpmc if the address range being unmapped contains * an executable region. 
*/ pkm.pm_address = (uintptr_t) NULL; if (vm_map_lookup_entry(map, addr, &entry)) { for (; entry->start < end; entry = vm_map_entry_succ(entry)) { if (vm_map_check_protection(map, entry->start, entry->end, VM_PROT_EXECUTE) == TRUE) { pkm.pm_address = (uintptr_t) addr; pkm.pm_size = (size_t) size; break; } } } } #endif rv = vm_map_delete(map, addr, end); #ifdef HWPMC_HOOKS if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) { /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); vm_map_unlock_read(map); } else #endif vm_map_unlock(map); return (vm_mmap_to_errno(rv)); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif int sys_mprotect(struct thread *td, struct mprotect_args *uap) { return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot)); } int kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot) { vm_offset_t addr; vm_size_t pageoff; int vm_error, max_prot; int flags; addr = addr0; if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) return (EINVAL); max_prot = PROT_MAX_EXTRACT(prot); prot = PROT_EXTRACT(prot); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { if (((addr + size) & 0xffffffff) < addr) return (EINVAL); } else #endif if (addr + size < addr) return (EINVAL); flags = VM_MAP_PROTECT_SET_PROT; if (max_prot != 0) flags |= VM_MAP_PROTECT_SET_MAXPROT; vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, max_prot, flags); switch (vm_error) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); case KERN_RESOURCE_SHORTAGE: return (ENOMEM); case KERN_OUT_OF_BOUNDS: return (ENOTSUP); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int 
inherit; }; #endif int sys_minherit(struct thread *td, struct minherit_args *uap) { return (kern_minherit(td, (uintptr_t)uap->addr, uap->len, uap->inherit)); } int kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0) { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)addr0; size = len; inherit = inherit0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif int sys_madvise(struct thread *td, struct madvise_args *uap) { return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); } int kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) { vm_map_t map; vm_offset_t addr, end, start; int flags; /* * Check for our special case, advising the swap pager we are * "immortal." */ if (behav == MADV_PROTECT) { flags = PPROT_SET; return (kern_procctl(td, P_PID, td->td_proc->p_pid, PROC_SPROTECT, &flags)); } /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; addr = addr0; if (!vm_map_range_valid(map, addr, addr + len)) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page(addr); end = round_page(addr + len); /* * vm_map_madvise() checks for illegal values of behav. 
*/ return (vm_map_madvise(map, start, end, behav)); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif int sys_mincore(struct thread *td, struct mincore_args *uap) { return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); } int kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) { pmap_t pmap; vm_map_t map; vm_map_entry_t current, entry; vm_object_t object; vm_offset_t addr, cend, end, first_addr; vm_paddr_t pa; vm_page_t m; vm_pindex_t pindex; int error, lastvecindex, mincoreinfo, vecindex; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page(addr0); end = round_page(addr0 + len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) { vm_map_unlock_read(map); return (ENOMEM); } /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; while (entry->start < end) { /* * check for contiguity */ current = entry; entry = vm_map_entry_succ(current); if (current->end < end && entry->start > current->end) { vm_map_unlock_read(map); return (ENOMEM); } /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; for (; addr < cend; addr += PAGE_SIZE) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. 
*/ m = NULL; object = NULL; retry: pa = 0; mincoreinfo = pmap_mincore(pmap, addr, &pa); if (mincore_mapped) { /* * We only care about this pmap's * mapping of the page, if any. */ ; } else if (pa != 0) { /* * The page is mapped by this process but not * both accessed and modified. It is also * managed. Acquire the object lock so that * other mappings might be examined. The page's * identity may change at any point before its * object lock is acquired, so re-validate if * necessary. */ m = PHYS_TO_VM_PAGE(pa); while (object == NULL || m->object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = atomic_load_ptr(&m->object); if (object == NULL) goto retry; VM_OBJECT_WLOCK(object); } if (pa != pmap_extract(pmap, addr)) goto retry; KASSERT(vm_page_all_valid(m), ("mincore: page %p is mapped but invalid", m)); } else if (mincoreinfo == 0) { /* * The page is not mapped by this process. If * the object implements managed pages, then * determine if the page is resident so that * the mappings might be examined. */ if (current->object.vm_object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = current->object.vm_object; VM_OBJECT_WLOCK(object); } - if (object->type == OBJT_DEFAULT || - (object->flags & OBJ_SWAP) != 0 || + if ((object->flags & OBJ_SWAP) != 0 || object->type == OBJT_VNODE) { pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); if (m != NULL && vm_page_none_valid(m)) m = NULL; if (m != NULL) mincoreinfo = MINCORE_INCORE; } } if (m != NULL) { VM_OBJECT_ASSERT_WLOCKED(m->object); /* Examine other mappings of the page. */ if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty != 0) mincoreinfo |= MINCORE_MODIFIED_OTHER; /* * The first test for PGA_REFERENCED is an * optimization. The second test is * required because a concurrent pmap * operation could clear the last reference * and set PGA_REFERENCED before the call to * pmap_is_referenced(). 
*/ if ((m->a.flags & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || (m->a.flags & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) VM_OBJECT_WUNLOCK(object); /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = atop(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = atop(end - first_addr); while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * If the map has changed, due to the subyte, the previous * output may be invalid. 
*/ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif int sys_mlock(struct thread *td, struct mlock_args *uap) { return (kern_mlock(td->td_proc, td->td_ucred, __DECONST(uintptr_t, uap->addr), uap->len)); } int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) { vm_offset_t addr, end, last, start; vm_size_t npages, size; vm_map_t map; unsigned long nsize; int error; error = priv_check_cred(cred, PRIV_VM_MLOCK); if (error) return (error); addr = addr0; size = len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_user_wired) return (ENOMEM); map = &proc->p_vmspace->vm_map; PROC_LOCK(proc); nsize = ptoa(npages + pmap_wired_count(map->pmap)); if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); #ifdef RACCT if (racct_enable) { PROC_LOCK(proc); error = racct_set(proc, RACCT_MEMLOCK, nsize); PROC_UNLOCK(proc); if (error != 0) return (ENOMEM); } #endif error = vm_map_wire(map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(proc); racct_set(proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(proc); } #endif switch (error) { case KERN_SUCCESS: return (0); case KERN_INVALID_ARGUMENT: return (EINVAL); default: return (ENOMEM); } } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif int sys_mlockall(struct thread *td, struct mlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); /* * If wiring all pages in the process would cause it to 
exceed * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) return (ENOMEM); } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); PROC_UNLOCK(td->td_proc); if (error != 0) return (ENOMEM); } #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); if (error == KERN_SUCCESS) error = 0; else if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EAGAIN; } #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif int sys_munlockall(struct thread *td, struct munlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. 
*/ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, 0); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif int sys_munlock(struct thread *td, struct munlock_args *uap) { return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); } int kern_munlock(struct thread *td, uintptr_t addr0, size_t size) { vm_offset_t addr, end, last, start; #ifdef RACCT vm_map_t map; #endif int error; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); addr = addr0; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); map = &td->td_proc->p_vmspace->vm_map; racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. 
*/ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, boolean_t *writecounted) { struct vattr va; vm_object_t obj; vm_ooffset_t foff; struct ucred *cred; int error, flags; bool writex; cred = td->td_ucred; writex = (*maxprotp & VM_PROT_WRITE) != 0 && (*flagsp & MAP_SHARED) != 0; if ((error = vget(vp, LK_SHARED)) != 0) return (error); AUDIT_ARG_VNODE1(vp); foff = *foffp; flags = *flagsp; obj = vp->v_object; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (obj == NULL) { error = EINVAL; goto done; } if (obj->type == OBJT_VNODE && obj->handle != vp) { vput(vp); vp = (struct vnode *)obj->handle; /* * Bypass filesystems obey the mpsafety of the * underlying fs. Tmpfs never bypasses. */ error = vget(vp, LK_SHARED); if (error != 0) return (error); } if (writex) { *writecounted = TRUE; vm_pager_update_writecount(obj, 0, objsize); } } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, cred))) goto done; #ifdef MAC /* This relies on VM_PROT_* matching PROT_*. */ error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { if (prot & VM_PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. 
*/ objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; if (obj->type == OBJT_VNODE) { obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred); if (obj == NULL) { error = ENOMEM; goto done; } } else { - KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP || - (obj->flags & OBJ_SWAP) != 0, ("wrong object type")); + KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type")); vm_object_reference(obj); #if VM_NRESERVLEVEL > 0 if ((obj->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(obj); vm_object_color(obj, 0); VM_OBJECT_WUNLOCK(obj); } #endif } *objp = obj; *flagsp = flags; VOP_MMAPPED(vp); done: if (error != 0 && *writecounted) { *writecounted = FALSE; vm_pager_update_writecount(obj, objsize, 0); } vput(vp); return (error); } /* * vm_mmap_cdev() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on cdevs. */ int vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, vm_ooffset_t *foff, vm_object_t *objp) { vm_object_t obj; int error, flags; flags = *flagsp; if (dsw->d_flags & D_MMAP_ANON) { *objp = NULL; *foff = 0; *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); } /* * cdevs do not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if (flags & (MAP_PRIVATE|MAP_COPY)) return (EINVAL); /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); if (error != 0) return (error); #endif /* * First, try d_mmap_single(). If that is not implemented * (returns ENODEV), fall back to using the device pager. * Note that d_mmap_single() must return a reference to the * object (it needs to bump the reference count of the object * it returns somehow). 
* * XXX assumes VM_PROT_* == PROT_* */ error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); if (error != ENODEV) return (error); obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, td->td_ucred); if (obj == NULL) return (EINVAL); *objp = obj; *flagsp = flags; return (0); } int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, objtype_t handle_type, void *handle, vm_ooffset_t foff) { vm_object_t object; struct thread *td = curthread; int error; boolean_t writecounted; if (size == 0) return (EINVAL); size = round_page(size); object = NULL; writecounted = FALSE; switch (handle_type) { case OBJT_DEVICE: { struct cdevsw *dsw; struct cdev *cdev; int ref; cdev = handle; dsw = dev_refthread(cdev, &ref); if (dsw == NULL) return (ENXIO); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, dsw, &foff, &object); dev_relthread(cdev, ref); break; } case OBJT_VNODE: error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, &foff, &object, &writecounted); break; default: error = EINVAL; break; } if (error) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0 && object != NULL) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. 
/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 *
 * On entry *addr is the requested address (a hint unless MAP_FIXED is
 * set); on success it holds the address actually mapped.  The mmap-style
 * "flags" are translated into vm_map "docow" bits before the map is
 * modified.  Returns 0 or an errno value; Mach return codes from the
 * vm_map layer are converted with vm_mmap_to_errno().
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	vm_offset_t max_addr;
	int docow, error, findspace, rv;
	bool curmap, fitit;

	/*
	 * Resource limits are only checked when mapping into the calling
	 * process's own address space.
	 */
	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		error = kern_mmap_racct_check(td, map, size);
		if (error != 0)
			return (error);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	/*
	 * Without MAP_FIXED the address is only a hint and is rounded up
	 * to a page boundary; with MAP_FIXED it must already be aligned.
	 */
	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}

	/* Anonymous mappings carry neither an object nor an offset. */
	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	/* Neither anonymous nor shared: the mapping is copy-on-write. */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	/* Stack mappings may not be backed by a caller-supplied object. */
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		/* Pick the placement policy from the alignment request. */
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
#ifdef MAP_32BIT
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
#endif
		if (curmap) {
			/*
			 * For the current process the search is given a
			 * minimum address: the page-rounded sum of the
			 * vmspace's vm_daddr and the RLIMIT_DATA maximum.
			 */
			rv = vm_map_find_min(map, object, foff, addr, size,
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
			    findspace, prot, maxprot, docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			/*
			 * Re-test under the map lock; the first test was
			 * made without it.  The result of the wiring
			 * attempt is deliberately discarded.
			 */
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean); static void vm_object_backing_remove(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. 
/*
 * Destructor registered for the object zone when INVARIANTS is defined.
 * It frees nothing; it only asserts that an object being returned to the
 * zone is fully torn down: no references, no resident pages in either the
 * memq or the radix trie, no reservations, not busied, no shadows, type
 * OBJT_DEAD, and no outstanding charge or credential.
 */
static void
vm_object_zdtor(void *mem, int size, void *arg)
{
	vm_object_t object;

	object = (vm_object_t)mem;
	KASSERT(object->ref_count == 0,
	    ("object %p ref_count = %d", object, object->ref_count));
	/* Both page indexes (list and trie) must already be empty. */
	KASSERT(TAILQ_EMPTY(&object->memq),
	    ("object %p has resident pages in its memq", object));
	KASSERT(vm_radix_is_empty(&object->rtree),
	    ("object %p has resident pages in its trie", object));
#if VM_NRESERVLEVEL > 0
	KASSERT(LIST_EMPTY(&object->rvq),
	    ("object %p has reservations",
	    object));
#endif
	KASSERT(!vm_object_busied(object),
	    ("object %p busy = %d", object, blockcount_read(&object->busy)));
	KASSERT(object->resident_page_count == 0,
	    ("object %p resident_page_count = %d",
	    object, object->resident_page_count));
	KASSERT(atomic_load_int(&object->shadow_count) == 0,
	    ("object %p shadow_count = %d",
	    object, atomic_load_int(&object->shadow_count)));
	KASSERT(object->type == OBJT_DEAD,
	    ("object %p has non-dead type %d",
	    object, object->type));
	KASSERT(object->charge == 0 && object->cred == NULL,
	    ("object %p has non-zero charge %ju (%p)",
	    object, (uintmax_t)object->charge, object->cred));
}
void
vm_object_init(void)
{
	/* Global list of objects and the mutex that protects it. */
	TAILQ_INIT(&vm_object_list);
	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);

	/*
	 * The kernel object is initialized in place rather than taken
	 * from the zone, which does not exist yet at this point.  It is
	 * an unmanaged OBJT_PHYS object whose size is the kernel address
	 * range expressed in pages.
	 */
	rw_init(&kernel_object->lock, "kernel vm object");
	_vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
	    VM_MIN_KERNEL_ADDRESS), OBJ_UNMANAGED, kernel_object, NULL);
#if VM_NRESERVLEVEL > 0
	/* With reservations enabled, colour it from its base address. */
	kernel_object->flags |= OBJ_COLORED;
	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
#endif
	kernel_object->un_pager.phys.ops = &default_phys_pg_ops;

	/*
	 * The lock portion of struct vm_object must be type stable due
	 * to vm_pageout_fallback_object_lock locking a vm object
	 * without holding any references to it.
	 *
	 * paging_in_progress is valid always.  Lockless references to
	 * the objects may acquire pip and then check OBJ_DEAD.
	 */
	/* vm_object_zdtor is only registered when INVARIANTS is defined. */
	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
#ifdef INVARIANTS
	    vm_object_zdtor,
#else
	    NULL,
#endif
	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);

	vm_radix_zinit();
}
*/ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_DEAD) return (KERN_INVALID_ARGUMENT); if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { if (i > 0) blockcount_acquire(&object->paging_in_progress, i); } void vm_object_pip_wakeup(vm_object_t object) { vm_object_pip_wakeupn(object, 1); } void vm_object_pip_wakeupn(vm_object_t object, short i) { if (i > 0) blockcount_release(&object->paging_in_progress, i); } /* * Atomically drop the object lock and wait for pip to drain. This protects * from sleep/wakeup races due to identity changes. The lock is not re-acquired * on return. */ static void vm_object_pip_sleep(vm_object_t object, const char *waitid) { (void)blockcount_sleep(&object->paging_in_progress, &object->lock, waitid, PVM | PDROP); } void vm_object_pip_wait(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); blockcount_wait(&object->paging_in_progress, &object->lock, waitid, PVM); } void vm_object_pip_wait_unlocked(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_UNLOCKED(object); blockcount_wait(&object->paging_in_progress, NULL, waitid, PVM); } /* * vm_object_allocate: * * Returns a new object with the given size. 
*/ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; u_short flags; switch (type) { case OBJT_DEAD: panic("vm_object_allocate: can't create OBJT_DEAD"); - case OBJT_DEFAULT: - flags = OBJ_COLORED; - break; case OBJT_SWAP: flags = OBJ_COLORED | OBJ_SWAP; break; case OBJT_DEVICE: case OBJT_SG: flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: flags = OBJ_UNMANAGED; break; case OBJT_VNODE: flags = 0; break; default: panic("vm_object_allocate: type %d is undefined or dynamic", type); } object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, flags, object, NULL); return (object); } vm_object_t vm_object_allocate_dyn(objtype_t dyntype, vm_pindex_t size, u_short flags) { vm_object_t object; MPASS(dyntype >= OBJT_FIRST_DYN /* && dyntype < nitems(pagertab) */); object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(dyntype, size, flags, object, NULL); return (object); } /* * vm_object_allocate_anon: * * Returns a new default object of the given size and marked as * anonymous memory for special split/collapse handling. Color * to be initialized by the caller. */ vm_object_t vm_object_allocate_anon(vm_pindex_t size, vm_object_t backing_object, struct ucred *cred, vm_size_t charge) { vm_object_t handle, object; if (backing_object == NULL) handle = NULL; else if ((backing_object->flags & OBJ_ANON) != 0) handle = backing_object->handle; else handle = backing_object; object = uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(OBJT_SWAP, size, OBJ_ANON | OBJ_ONEMAPPING | OBJ_SWAP, object, handle); object->cred = cred; object->charge = cred != NULL ? charge : 0; return (object); } static void vm_object_reference_vnode(vm_object_t object) { u_int old; /* * vnode objects need the lock for the first reference * to serialize with vnode_object_deallocate(). 
/*
 *	vm_object_reference_locked:
 *
 *	Gets another reference to the given object.  If this is the
 *	first reference to a vnode-backed object, the vnode is
 *	referenced as well.
 *
 *	The object must be locked.
 */
void
vm_object_reference_locked(vm_object_t object)
{
	u_int old;

	VM_OBJECT_ASSERT_LOCKED(object);
	old = refcount_acquire(&object->ref_count);
	/* Going from zero to one on a vnode object also pins the vnode. */
	if (object->type == OBJT_VNODE && old == 0)
		vref(object->handle);
	/*
	 * Use __func__ so the panic names this function; the message
	 * previously named vm_object_reference() instead.
	 */
	KASSERT((object->flags & OBJ_DEAD) == 0,
	    ("%s: Referenced dead object %p.", __func__, object));
}
*/ object = LIST_FIRST(&backing_object->shadow_head); KASSERT(object != NULL && atomic_load_int(&backing_object->shadow_count) == 1, ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d", backing_object->ref_count, atomic_load_int(&backing_object->shadow_count))); KASSERT((object->flags & OBJ_ANON) != 0, ("invalid shadow object %p", object)); if (!VM_OBJECT_TRYWLOCK(object)) { /* * Prevent object from disappearing since we do not have a * reference. */ vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(backing_object); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); } else VM_OBJECT_WUNLOCK(backing_object); /* * Check for a collapse/terminate race with the last reference holder. */ if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 || !refcount_acquire_if_not_zero(&object->ref_count)) { VM_OBJECT_WUNLOCK(object); return (NULL); } backing_object = object->backing_object; if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0) vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); return (object); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; bool released; while (object != NULL) { /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. A ref count * of 1 may be a special case depending on the shadow count * being 0 or 1. These cases require a write lock on the * object. 
*/ if ((object->flags & OBJ_ANON) == 0) released = refcount_release_if_gt(&object->ref_count, 1); else released = refcount_release_if_gt(&object->ref_count, 2); if (released) return; if (object->type == OBJT_VNODE) { VM_OBJECT_RLOCK(object); if (object->type == OBJT_VNODE) { vm_object_deallocate_vnode(object); return; } VM_OBJECT_RUNLOCK(object); } VM_OBJECT_WLOCK(object); KASSERT(object->ref_count > 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If this is not the final reference to an anonymous * object we may need to collapse the shadow chain. */ if (!refcount_release(&object->ref_count)) { if (object->ref_count > 1 || atomic_load_int(&object->shadow_count) == 0) { if ((object->flags & OBJ_ANON) != 0 && object->ref_count == 1) vm_object_set_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); return; } /* Handle collapsing last ref on anonymous objects. */ object = vm_object_deallocate_anon(object); continue; } /* * Handle the final reference to an object. We restart * the loop with the backing object to avoid recursion. 
*/ umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { - KASSERT(object->type == OBJT_DEFAULT || - object->type == OBJT_SWAP, + KASSERT(object->type == OBJT_SWAP, ("shadowed tmpfs v_object 2 %p", object)); vm_object_backing_remove(object); } KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_deallocate: Terminating dead object.")); vm_object_set_flag(object, OBJ_DEAD); vm_object_terminate(object); object = temp; } } void vm_object_destroy(vm_object_t object) { uma_zfree(obj_zone, object); } static void vm_object_sub_shadow(vm_object_t object) { KASSERT(object->shadow_count >= 1, ("object %p sub_shadow count zero", object)); atomic_subtract_int(&object->shadow_count, 1); } static void vm_object_backing_remove_locked(vm_object_t object) { vm_object_t backing_object; backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT((object->flags & OBJ_COLLAPSING) == 0, ("vm_object_backing_remove: Removing collapsing object.")); vm_object_sub_shadow(backing_object); if ((object->flags & OBJ_SHADOWLIST) != 0) { LIST_REMOVE(object, shadow_list); vm_object_clear_flag(object, OBJ_SHADOWLIST); } object->backing_object = NULL; } static void vm_object_backing_remove(vm_object_t object) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; if ((object->flags & OBJ_SHADOWLIST) != 0) { VM_OBJECT_WLOCK(backing_object); vm_object_backing_remove_locked(object); VM_OBJECT_WUNLOCK(backing_object); } else { object->backing_object = NULL; vm_object_sub_shadow(backing_object); } } static void vm_object_backing_insert_locked(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); atomic_add_int(&backing_object->shadow_count, 1); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_ASSERT_WLOCKED(backing_object); LIST_INSERT_HEAD(&backing_object->shadow_head, object, shadow_list); vm_object_set_flag(object, 
/*
 * Make "backing_object" the backing object of "object", which must be
 * write-locked.  An anonymous backing object is write-locked around the
 * insertion; any other backing object only has its shadow count bumped
 * atomically.
 */
static void
vm_object_backing_insert(vm_object_t object, vm_object_t backing_object)
{

	VM_OBJECT_ASSERT_WLOCKED(object);

	if ((backing_object->flags & OBJ_ANON) == 0) {
		object->backing_object = backing_object;
		atomic_add_int(&backing_object->shadow_count, 1);
		return;
	}

	VM_OBJECT_WLOCK(backing_object);
	vm_object_backing_insert_locked(object, backing_object);
	VM_OBJECT_WUNLOCK(backing_object);
}
/*
 * vm_object_terminate_pages removes any remaining pageable pages
 * from the object and resets the object to an empty state.
 *
 * The object must be write-locked.
 */
static void
vm_object_terminate_pages(vm_object_t object)
{
	vm_page_t p, p_next;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Free any remaining pageable pages.  This also removes them from the
	 * paging queues.  However, don't free wired pages, just remove them
	 * from the object.  Rather than incrementally removing each page from
	 * the object, the page and object are reset to an empty state.
	 */
	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
		vm_page_assert_unbusied(p);
		KASSERT(p->object == object &&
		    (p->ref_count & VPRC_OBJREF) != 0,
		    ("vm_object_terminate_pages: page %p is inconsistent", p));

		/*
		 * Detach the page, then drop the object's reference.  The
		 * page is freed only when that was the last reference.
		 */
		p->object = NULL;
		if (vm_page_drop(p, VPRC_OBJREF) == VPRC_OBJREF) {
			VM_CNT_INC(v_pfree);
			vm_page_free(p);
		}
	}

	/*
	 * If the object contained any pages, then reset it to an empty state.
	 * None of the object's fields, including "resident_page_count", were
	 * modified by the preceding loop.
	 */
	if (object->resident_page_count != 0) {
		vm_radix_reclaim_allnodes(&object->rtree);
		TAILQ_INIT(&object->memq);
		object->resident_page_count = 0;
		/* NOTE(review): presumably releases a vnode hold taken when the first page was inserted -- confirm. */
		if (object->type == OBJT_VNODE)
			vdrop(object->handle);
	}
}
*/ vm_object_pip_wait(object, "objtrm"); KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); if ((object->flags & OBJ_PG_DTOR) == 0) vm_object_terminate_pages(object); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif - KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || - (object->flags & OBJ_SWAP) != 0, + KASSERT(object->cred == NULL || (object->flags & OBJ_SWAP) != 0, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean) { vm_page_assert_busied(p); /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->a.flags & PGA_NOSYNC) != 0) { *allclean = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with PGA_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * For swap objects backing tmpfs regular files, do not flush anything, * but remove write protection on the mapped pages to update mtime through * mmaped writes. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. 
*
 *	Odd semantics: if start == end, we clean everything.
 *
 *	The object must be locked.
 *
 *	Returns FALSE if some page from the range was not written, as
 *	reported by the pager, and TRUE otherwise.
 */
boolean_t
vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end,
    int flags)
{
	vm_page_t np, p;
	vm_pindex_t pi, tend, tstart;
	int curgeneration, n, pagerflags;
	boolean_t eio, res, allclean;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/* Nothing to do for an object that cannot be dirty or has no pages. */
	if (!vm_object_mightbedirty(object) || object->resident_page_count == 0)
		return (TRUE);

	/* OBJPC_SYNC or OBJPC_INVAL selects synchronous puts. */
	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ?
	    VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
	pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0;

	/* An "end" of zero means "through the end of the object". */
	tstart = OFF_TO_IDX(start);
	tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK);
	/* allclean starts TRUE only when the whole object is being scanned. */
	allclean = tstart == 0 && tend >= object->size;
	res = TRUE;

rescan:
	curgeneration = object->generation;

	for (p = vm_page_find_least(object, tstart); p != NULL; p = np) {
		pi = p->pindex;
		if (pi >= tend)
			break;
		np = TAILQ_NEXT(p, listq);
		if (vm_page_none_valid(p))
			continue;
		if (vm_page_busy_acquire(p, VM_ALLOC_WAITFAIL) == 0) {
			/*
			 * The busy acquire failed.  For a synchronous clean,
			 * restart the scan if the object's generation changed;
			 * otherwise look the index up again.
			 */
			if (object->generation != curgeneration &&
			    (flags & OBJPC_SYNC) != 0)
				goto rescan;
			np = vm_page_find_least(object, pi);
			continue;
		}
		if (!vm_object_page_remove_write(p, flags, &allclean)) {
			vm_page_xunbusy(p);
			continue;
		}
		if (object->type == OBJT_VNODE) {
			n = vm_object_page_collect_flush(object, p, pagerflags,
			    flags, &allclean, &eio);
			if (eio) {
				res = FALSE;
				allclean = FALSE;
			}
			if (object->generation != curgeneration &&
			    (flags & OBJPC_SYNC) != 0)
				goto rescan;

			/*
			 * If the VOP_PUTPAGES() did a truncated write, so
			 * that even the first page of the run is not fully
			 * written, vm_pageout_flush() returns 0 as the run
			 * length.  Since the condition that caused truncated
			 * write may be permanent, e.g. exhausted free space,
			 * accepting n == 0 would cause an infinite loop.
			 *
			 * Forwarding the iterator leaves the unwritten page
			 * behind, but there is not much we can do there if
			 * filesystem refuses to write it.
			 */
			if (n == 0) {
				n = 1;
				allclean = FALSE;
			}
		} else {
			/* Non-vnode objects are only write-protected above. */
			n = 1;
			vm_page_xunbusy(p);
		}
		np = vm_page_find_least(object, pi + n);
	}
#if 0
	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0);
#endif

	/*
	 * Leave updating cleangeneration for tmpfs objects to tmpfs
	 * scan.  It needs to update mtime, which happens for other
	 * filesystems during page writeouts.
	 */
	if (allclean && object->type == OBJT_VNODE)
		object->cleangeneration = curgeneration;
	return (res);
}

/*
 * Gather a run of pages around the exclusively busied page "p" and hand the
 * run to vm_pageout_flush().
 *
 * The run is extended forward from "p" first and then backward, stopping at
 * the first page that is missing, cannot be exclusively busied without
 * waiting, or is rejected by vm_object_page_remove_write().  The run never
 * exceeds vm_pageout_page_count pages.  "mreq" counts the pages added before
 * "p", i.e. it is the index of "p" within the array passed to the flush.
 *
 * The object must be write-locked.  Returns the run length reported by
 * vm_pageout_flush(); "*eio" is written by that call.
 */
static int
vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags,
    int flags, boolean_t *allclean, boolean_t *eio)
{
	vm_page_t ma[vm_pageout_page_count], p_first, tp;
	int count, i, mreq, runlen;

	vm_page_lock_assert(p, MA_NOTOWNED);
	vm_page_assert_xbusied(p);
	VM_OBJECT_ASSERT_WLOCKED(object);

	count = 1;
	mreq = 0;

	/* Extend the run forward. */
	for (tp = p; count < vm_pageout_page_count; count++) {
		tp = vm_page_next(tp);
		if (tp == NULL || vm_page_tryxbusy(tp) == 0)
			break;
		if (!vm_object_page_remove_write(tp, flags, allclean)) {
			vm_page_xunbusy(tp);
			break;
		}
	}

	/* Extend the run backward with whatever room is left. */
	for (p_first = p; count < vm_pageout_page_count; count++) {
		tp = vm_page_prev(p_first);
		if (tp == NULL || vm_page_tryxbusy(tp) == 0)
			break;
		if (!vm_object_page_remove_write(tp, flags, allclean)) {
			vm_page_xunbusy(tp);
			break;
		}
		p_first = tp;
		mreq++;
	}

	/* Collect the run, in object list order, into the flush array. */
	for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++)
		ma[i] = tp;

	vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio);
	return (runlen);
}

/*
 * Note that there is absolutely no sense in writing out
 * anonymous objects, so we track down the vnode object
 * to write out.
 * We invalidate (remove) all pages from the address space
 * for semantic correctness.
 *
 * If the backing object is a device object with unmanaged pages, then any
 * mappings to the specified range of pages must be removed before this
 * function is called.
*
 * Note: certain anonymous maps, such as MAP_NOSYNC maps,
 * may start out with a NULL object.
 */
boolean_t
vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
    boolean_t syncio, boolean_t invalidate)
{
	vm_object_t backing_object;
	struct vnode *vp;
	struct mount *mp;
	int error, flags, fsync_after;
	boolean_t res;

	if (object == NULL)
		return (TRUE);
	res = TRUE;
	error = 0;
	VM_OBJECT_WLOCK(object);
	/*
	 * Walk to the bottom of the shadow chain, locking each backing
	 * object before unlocking its parent, translating the offset and
	 * clipping the size to the object reached.
	 */
	while ((backing_object = object->backing_object) != NULL) {
		VM_OBJECT_WLOCK(backing_object);
		offset += object->backing_object_offset;
		VM_OBJECT_WUNLOCK(object);
		object = backing_object;
		if (object->size < OFF_TO_IDX(offset + size))
			size = IDX_TO_OFF(object->size) - offset;
	}
	/*
	 * Flush pages if writing is allowed, invalidate them
	 * if invalidation requested.  Pages undergoing I/O
	 * will be ignored by vm_object_page_remove().
	 *
	 * We cannot lock the vnode and then wait for paging
	 * to complete without deadlocking against vm_fault.
	 * Instead we simply call vm_object_page_remove() and
	 * allow it to block internally on a page-by-page
	 * basis when it encounters pages undergoing async
	 * I/O.
	 */
	if (object->type == OBJT_VNODE &&
	    vm_object_mightbedirty(object) != 0 &&
	    ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) {
		/* The object lock is dropped while the vnode is locked. */
		VM_OBJECT_WUNLOCK(object);
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (syncio && !invalidate && offset == 0 &&
		    atop(size) == object->size) {
			/*
			 * If syncing the whole mapping of the file,
			 * it is faster to schedule all the writes in
			 * async mode, also allowing the clustering,
			 * and then wait for i/o to complete.
			 */
			flags = 0;
			fsync_after = TRUE;
		} else {
			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
			flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0;
			fsync_after = FALSE;
		}
		VM_OBJECT_WLOCK(object);
		res = vm_object_page_clean(object, offset, offset + size,
		    flags);
		VM_OBJECT_WUNLOCK(object);
		if (fsync_after)
			error = VOP_FSYNC(vp, MNT_WAIT, curthread);
		VOP_UNLOCK(vp);
		vn_finished_write(mp);
		/* A failed fsync is reported the same way as a failed clean. */
		if (error != 0)
			res = FALSE;
		VM_OBJECT_WLOCK(object);
	}
	if ((object->type == OBJT_VNODE ||
	     object->type == OBJT_DEVICE) && invalidate) {
		if (object->type == OBJT_DEVICE)
			/*
			 * The option OBJPR_NOTMAPPED must be passed here
			 * because vm_object_page_remove() cannot remove
			 * unmanaged mappings.
			 */
			flags = OBJPR_NOTMAPPED;
		else if (old_msync)
			flags = 0;
		else
			flags = OBJPR_CLEANONLY;
		vm_object_page_remove(object, OFF_TO_IDX(offset),
		    OFF_TO_IDX(offset + size + PAGE_MASK), flags);
	}
	VM_OBJECT_WUNLOCK(object);
	return (res);
}

/*
 * Determine whether the given advice can be applied to the object.  Advice is
 * not applied to unmanaged pages since they never belong to page queues, and
 * since MADV_FREE is destructive, it can apply only to anonymous pages that
 * have been mapped at most once.
 *
 * Concretely: never for OBJ_UNMANAGED objects; always for advice other than
 * MADV_FREE; and for MADV_FREE only when both OBJ_ONEMAPPING and OBJ_ANON
 * are set.
 */
static bool
vm_object_advice_applies(vm_object_t object, int advice)
{

	if ((object->flags & OBJ_UNMANAGED) != 0)
		return (false);
	if (advice != MADV_FREE)
		return (true);
	return ((object->flags & (OBJ_ONEMAPPING | OBJ_ANON)) ==
	    (OBJ_ONEMAPPING | OBJ_ANON));
}

/*
 * For MADV_FREE only, ask the pager to release the space backing "size"
 * pages of the object starting at "pindex".  Other advice values are a
 * no-op here.
 */
static void
vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex,
    vm_size_t size)
{

	if (advice == MADV_FREE)
		vm_pager_freespace(object, pindex, size);
}

/*
 *	vm_object_madvise:
 *
 *	Implements the madvise function at the object/page level.
 *
 *	MADV_WILLNEED	(any object)
 *
 *	    Activate the specified pages if they are resident.
 *
 *	MADV_DONTNEED	(any object)
 *
 *	    Deactivate the specified pages if they are resident.
 *
 *	MADV_FREE	(OBJT_SWAP objects, OBJ_ONEMAPPING only; see
 *			 vm_object_advice_applies() for the exact flag test)
 *
 *	    Deactivate and clean the specified pages if they are
 *	    resident.  This permits the process to reuse the pages
 *	    without faulting or the kernel to reclaim the pages
 *	    without I/O.
 *
 *	The range is [pindex, end).  A NULL object is accepted and ignored.
 *	The object is locked and unlocked internally.
 */
void
vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
    int advice)
{
	vm_pindex_t tpindex;
	vm_object_t backing_object, tobject;
	vm_page_t m, tm;

	if (object == NULL)
		return;

	/*
	 * Restart point after sleeping on a busy page: all locks have been
	 * dropped by then, so the scan resumes from the current "pindex".
	 */
relookup:
	VM_OBJECT_WLOCK(object);
	if (!vm_object_advice_applies(object, advice)) {
		VM_OBJECT_WUNLOCK(object);
		return;
	}
	for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) {
		tobject = object;

		/*
		 * If the next page isn't resident in the top-level object, we
		 * need to search the shadow chain.  When applying MADV_FREE, we
		 * take care to release any swap space used to store
		 * non-resident pages.
		 */
		if (m == NULL || pindex < m->pindex) {
			/*
			 * Optimize a common case: if the top-level object has
			 * no backing object, we can skip over the non-resident
			 * range in constant time.
			 */
			if (object->backing_object == NULL) {
				tpindex = (m != NULL && m->pindex < end) ?
				    m->pindex : end;
				vm_object_madvise_freespace(object, advice,
				    pindex, tpindex - pindex);
				if ((pindex = tpindex) == end)
					break;
				goto next_page;
			}

			tpindex = pindex;
			do {
				vm_object_madvise_freespace(tobject, advice,
				    tpindex, 1);
				/*
				 * Prepare to search the next object in the
				 * chain.
				 */
				backing_object = tobject->backing_object;
				if (backing_object == NULL)
					goto next_pindex;
				VM_OBJECT_WLOCK(backing_object);
				tpindex +=
				    OFF_TO_IDX(tobject->backing_object_offset);
				/* The top-level object stays locked throughout. */
				if (tobject != object)
					VM_OBJECT_WUNLOCK(tobject);
				tobject = backing_object;
				if (!vm_object_advice_applies(tobject, advice))
					goto next_pindex;
			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
			    NULL);
		} else {
next_page:
			tm = m;
			m = TAILQ_NEXT(m, listq);
		}

		/*
		 * If the page is not in a normal state, skip it.  The page
		 * can not be invalidated while the object lock is held.
		 */
		if (!vm_page_all_valid(tm) || vm_page_wired(tm))
			goto next_pindex;
		KASSERT((tm->flags & PG_FICTITIOUS) == 0,
		    ("vm_object_madvise: page %p is fictitious", tm));
		KASSERT((tm->oflags & VPO_UNMANAGED) == 0,
		    ("vm_object_madvise: page %p is not managed", tm));
		if (vm_page_tryxbusy(tm) == 0) {
			if (object != tobject)
				VM_OBJECT_WUNLOCK(object);
			if (advice == MADV_WILLNEED) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_aflag_set(tm, PGA_REFERENCED);
			}
			/*
			 * NOTE(review): tobject is unlocked here only when
			 * vm_page_busy_sleep() returns false; presumably a
			 * true return means it already dropped that lock -
			 * confirm.
			 */
			if (!vm_page_busy_sleep(tm, "madvpo", 0))
				VM_OBJECT_WUNLOCK(tobject);
  			goto relookup;
		}
		vm_page_advise(tm, advice);
		vm_page_xunbusy(tm);
		vm_object_madvise_freespace(tobject, advice, tm->pindex, 1);
next_pindex:
		if (tobject != object)
			VM_OBJECT_WUNLOCK(tobject);
	}
	VM_OBJECT_WUNLOCK(object);
}

/*
 *	vm_object_shadow:
 *
 *	Create a new object which is backed by the
 *	specified existing object range.  The source
 *	object reference is deallocated.
 *
 *	The new object and offset into that object
 *	are returned in the source parameters.
 */
void
vm_object_shadow(vm_object_t *object, vm_ooffset_t *offset, vm_size_t length,
    struct ucred *cred, bool shared)
{
	vm_object_t source;
	vm_object_t result;

	source = *object;

	/*
	 * Don't create the new object if the old object isn't shared.
	 *
	 * If we hold the only reference we can guarantee that it won't
	 * increase while we have the map locked.  Otherwise the race is
	 * harmless and we will end up with an extra shadow object that
	 * will be collapsed later.
	 */
	if (source != NULL && source->ref_count == 1 &&
	    (source->flags & OBJ_ANON) != 0)
		return;

	/*
	 * Allocate a new object with the given length.
	 */
	result = vm_object_allocate_anon(atop(length), source, cred, length);

	/*
	 * Store the offset into the source object, and fix up the offset into
	 * the new object.
*/ result->backing_object_offset = *offset; if (shared || source != NULL) { VM_OBJECT_WLOCK(result); /* * The new object shadows the source object, adding a * reference to it. Our caller changes his reference * to point to the new object, removing a reference to * the source object. Net result: no change of * reference count, unless the caller needs to add one * more reference due to forking a shared map entry. */ if (shared) { vm_object_reference_locked(result); vm_object_clear_flag(result, OBJ_ONEMAPPING); } /* * Try to optimize the result object's page color when * shadowing in order to maintain page coloring * consistency in the combined shadowed object. */ if (source != NULL) { vm_object_backing_insert(result, source); result->domain = source->domain; #if VM_NRESERVLEVEL > 0 vm_object_set_flag(result, (source->flags & OBJ_COLORED)); result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif } VM_OBJECT_WUNLOCK(result); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_busy, m_next; vm_object_t orig_object, new_object, backing_object; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0, ("vm_object_split: Splitting object with multiple mappings.")); if ((orig_object->flags & OBJ_ANON) == 0) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); - /* - * If swap_pager_copy() is later called, it will convert new_object - * into a swap object. 
- */ new_object = vm_object_allocate_anon(size, orig_object, orig_object->cred, ptoa(size)); /* * We must wait for the orig_object to complete any in-progress * collapse so that the swap blocks are stable below. The * additional reference on backing_object by new object will * prevent further collapse operations until split completes. */ VM_OBJECT_WLOCK(orig_object); vm_object_collapse_wait(orig_object); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); new_object->domain = orig_object->domain; backing_object = orig_object->backing_object; if (backing_object != NULL) { vm_object_backing_insert_ref(new_object, backing_object); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; } if (orig_object->cred != NULL) { crhold(orig_object->cred); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } /* * Mark the split operation so that swap_pager_getpages() knows * that the object is in transition. */ vm_object_set_flag(orig_object, OBJ_SPLIT); m_busy = NULL; #ifdef INVARIANTS idx = 0; #endif retry: m = vm_page_find_least(orig_object, offidxstart); KASSERT(m == NULL || idx <= m->pindex - offidxstart, ("%s: object %p was repopulated", __func__, orig_object)); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_tryxbusy(m) == 0) { VM_OBJECT_WUNLOCK(new_object); if (vm_page_busy_sleep(m, "spltwt", 0)) VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* * The page was left invalid. Likely placed there by * an incomplete fault. Just remove and ignore. 
*/ if (vm_page_none_valid(m)) { if (vm_page_remove(m)) vm_page_free(m); continue; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { vm_page_xunbusy(m); VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); vm_radix_wait(); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif /* * orig_object's type may change while sleeping, so keep track * of the beginning of the busied range. */ if (orig_object->type != OBJT_SWAP) vm_page_xunbusy(m); else if (m_busy == NULL) m_busy = m; } if ((orig_object->flags & OBJ_SWAP) != 0) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. 
*/ swap_pager_copy(orig_object, new_object, offidxstart, 0); if (m_busy != NULL) TAILQ_FOREACH_FROM(m_busy, &new_object->memq, listq) vm_page_xunbusy(m_busy); } vm_object_clear_flag(orig_object, OBJ_SPLIT); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); /* The page is only NULL when rename fails. */ if (p == NULL) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } else if (p->object == object) { VM_OBJECT_WUNLOCK(backing_object); if (vm_page_busy_sleep(p, "vmocol", 0)) VM_OBJECT_WLOCK(object); } else { VM_OBJECT_WUNLOCK(object); if (!vm_page_busy_sleep(p, "vmocol", 0)) VM_OBJECT_WUNLOCK(backing_object); VM_OBJECT_WLOCK(object); } VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if ((backing_object->flags & OBJ_ANON) == 0) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. 
*/
	for (;; pi++) {
		/*
		 * Advance the resident-page cursor "p" and the swap cursor
		 * "ps" so that neither is behind "pi", then jump "pi" to the
		 * smaller of the two.
		 */
		if (p != NULL && p->pindex < pi)
			p = TAILQ_NEXT(p, listq);
		if (ps < pi)
			ps = swap_pager_find_least(backing_object, pi);
		if (p == NULL && ps >= backing_object->size)
			break;
		else if (p == NULL)
			pi = ps;
		else
			pi = MIN(p->pindex, ps);

		new_pindex = pi - backing_offset_index;
		if (new_pindex >= object->size)
			break;

		if (p != NULL) {
			/*
			 * If the backing object page is busy a
			 * grandparent or older page may still be
			 * undergoing CoW.  It is not safe to collapse
			 * the backing object until it is quiesced.
			 */
			if (vm_page_tryxbusy(p) == 0)
				return (false);

			/*
			 * We raced with the fault handler that left
			 * newly allocated invalid page on the object
			 * queue and retried.
			 */
			if (!vm_page_all_valid(p))
				goto unbusy_ret;
		}

		/*
		 * See if the parent has the page or if the parent's object
		 * pager has the page.  If the parent has the page but the page
		 * is not valid, the parent's object pager must have the page.
		 *
		 * If this fails, the parent does not completely shadow the
		 * object and we might as well give up now.
		 */
		pp = vm_page_lookup(object, new_pindex);

		/*
		 * The valid check here is stable due to object lock
		 * being required to clear valid and initiate paging.
		 * Busy of p disallows fault handler to validate pp.
		 */
		if ((pp == NULL || vm_page_none_valid(pp)) &&
		    !vm_pager_has_page(object, new_pindex, NULL, NULL))
			goto unbusy_ret;
		if (p != NULL)
			vm_page_xunbusy(p);
	}
	return (true);

	/* Failure exit: drop the busy on "p", if one is held. */
unbusy_ret:
	if (p != NULL)
		vm_page_xunbusy(p);
	return (false);
}

/*
 * Walk every resident page of the backing object and either free it or
 * rename it into the parent object.
 *
 * A backing page is freed when it lies outside the range the parent maps,
 * when it is not fully valid, or when the parent already has a page or
 * pager copy at that index.  Otherwise it is renamed into the parent.
 * Whenever a page is busy or a rename fails, the function waits in
 * vm_object_collapse_scan_wait() and restarts from the page it returns.
 *
 * Both the object and its backing object must be write-locked.
 */
static void
vm_object_collapse_scan(vm_object_t object)
{
	vm_object_t backing_object;
	vm_page_t next, p, pp;
	vm_pindex_t backing_offset_index, new_pindex;

	VM_OBJECT_ASSERT_WLOCKED(object);
	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);

	backing_object = object->backing_object;
	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);

	/*
	 * Our scan
	 */
	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
		next = TAILQ_NEXT(p, listq);
		new_pindex = p->pindex - backing_offset_index;

		/*
		 * Check for busy page
		 */
		if (vm_page_tryxbusy(p) == 0) {
			next = vm_object_collapse_scan_wait(object, p);
			continue;
		}

		KASSERT(object->backing_object == backing_object,
		    ("vm_object_collapse_scan: backing object mismatch %p != %p",
		    object->backing_object, backing_object));
		KASSERT(p->object == backing_object,
		    ("vm_object_collapse_scan: object mismatch %p != %p",
		    p->object, backing_object));

		/* Pages outside the range the parent maps are discarded. */
		if (p->pindex < backing_offset_index ||
		    new_pindex >= object->size) {
			vm_pager_freespace(backing_object, p->pindex, 1);

			KASSERT(!pmap_page_is_mapped(p),
			    ("freeing mapped page %p", p));
			if (vm_page_remove(p))
				vm_page_free(p);
			continue;
		}

		/* Pages that are not fully valid are discarded as well. */
		if (!vm_page_all_valid(p)) {
			KASSERT(!pmap_page_is_mapped(p),
			    ("freeing mapped page %p", p));
			if (vm_page_remove(p))
				vm_page_free(p);
			continue;
		}

		pp = vm_page_lookup(object, new_pindex);
		if (pp != NULL && vm_page_tryxbusy(pp) == 0) {
			vm_page_xunbusy(p);
			/*
			 * The page in the parent is busy and possibly not
			 * (yet) valid.  Until its state is finalized by the
			 * busy bit owner, we can't tell whether it shadows the
			 * original page.
			 */
			next = vm_object_collapse_scan_wait(object, pp);
			continue;
		}

		if (pp != NULL && vm_page_none_valid(pp)) {
			/*
			 * The page was invalid in the parent.  Likely placed
			 * there by an incomplete fault.
Just remove and
			 * ignore.  p can replace it.
			 */
			if (vm_page_remove(pp))
				vm_page_free(pp);
			pp = NULL;
		}

		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
			NULL)) {
			/*
			 * The page already exists in the parent OR swap exists
			 * for this location in the parent.  Leave the parent's
			 * page alone.  Destroy the original page from the
			 * backing object.
			 */
			vm_pager_freespace(backing_object, p->pindex, 1);

			KASSERT(!pmap_page_is_mapped(p),
			    ("freeing mapped page %p", p));
			if (vm_page_remove(p))
				vm_page_free(p);
			if (pp != NULL)
				vm_page_xunbusy(pp);
			continue;
		}

		/*
		 * Page does not exist in parent, rename the page from the
		 * backing object to the main object.
		 *
		 * If the page was mapped to a process, it can remain mapped
		 * through the rename.  vm_page_rename() will dirty the page.
		 */
		if (vm_page_rename(p, object, new_pindex)) {
			/* Passing NULL makes the wait helper use vm_radix_wait(). */
			vm_page_xunbusy(p);
			next = vm_object_collapse_scan_wait(object, NULL);
			continue;
		}

		/* Use the old pindex to free the right page. */
		vm_pager_freespace(backing_object, new_pindex +
		    backing_offset_index, 1);

#if VM_NRESERVLEVEL > 0
		/*
		 * Rename the reservation.
		 */
		vm_reserv_rename(p, object, backing_object,
		    backing_offset_index);
#endif
		vm_page_xunbusy(p);
	}
	return;
}

/*
 *	vm_object_collapse:
 *
 *	Collapse an object with the object backing it.
 *	Pages in the backing object are moved into the
 *	parent, and the backing object is deallocated.
 *
 *	The object must be write-locked and is write-locked on return,
 *	although the lock is dropped and re-taken along the way.  The loop
 *	repeats for each new backing object until there is no anonymous
 *	backing object left or the current one can be neither collapsed
 *	nor bypassed.
 */
void
vm_object_collapse(vm_object_t object)
{
	vm_object_t backing_object, new_backing_object;

	VM_OBJECT_ASSERT_WLOCKED(object);

	while (TRUE) {
		KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON,
		    ("collapsing invalid object"));

		/*
		 * Wait for the backing_object to finish any pending
		 * collapse so that the caller sees the shortest possible
		 * shadow chain.
		 */
		backing_object = vm_object_backing_collapse_wait(object);
		if (backing_object == NULL)
			return;

		KASSERT(object->ref_count > 0 &&
		    object->ref_count > atomic_load_int(&object->shadow_count),
		    ("collapse with invalid ref %d or shadow %d count.",
		    object->ref_count, atomic_load_int(&object->shadow_count)));
		KASSERT((backing_object->flags &
		    (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
		    ("vm_object_collapse: Backing object already collapsing."));
		KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
		    ("vm_object_collapse: object is already collapsing."));

		/*
		 * We know that we can either collapse the backing object if
		 * the parent is the only reference to it, or (perhaps) have
		 * the parent bypass the object if the parent happens to shadow
		 * all the resident pages in the entire backing object.
		 */
		if (backing_object->ref_count == 1) {
			KASSERT(atomic_load_int(&backing_object->shadow_count)
			    == 1,
			    ("vm_object_collapse: shadow_count: %d",
			    atomic_load_int(&backing_object->shadow_count)));
			/*
			 * Mark both objects busy for the duration: the
			 * parent as collapsing, the backing object as dead.
			 */
			vm_object_pip_add(object, 1);
			vm_object_set_flag(object, OBJ_COLLAPSING);
			vm_object_pip_add(backing_object, 1);
			vm_object_set_flag(backing_object, OBJ_DEAD);

			/*
			 * If there is exactly one reference to the backing
			 * object, we can collapse it into the parent.
			 */
			vm_object_collapse_scan(object);

#if VM_NRESERVLEVEL > 0
			/*
			 * Break any reservations from backing_object.
			 */
			if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
				vm_reserv_break_all(backing_object);
#endif

			/*
			 * Move the pager from backing_object to object.
			 */
			if ((backing_object->flags & OBJ_SWAP) != 0) {
				/*
				 * swap_pager_copy() can sleep, in which case
				 * the backing_object's and object's locks are
				 * released and reacquired.
				 * Since swap_pager_copy() is being asked to
				 * destroy backing_object, it will change the
				 * type to OBJT_DEFAULT.
				 *
				 * NOTE(review): the last sentence still names
				 * OBJT_DEFAULT, which nearby code no longer
				 * references - confirm what swap_pager_copy()
				 * does to the type and update this comment.
				 */
				swap_pager_copy(
				    backing_object,
				    object,
				    OFF_TO_IDX(object->backing_object_offset), TRUE);
			}

			/*
			 * Object now shadows whatever backing_object did.
			 */
			vm_object_clear_flag(object, OBJ_COLLAPSING);
			vm_object_backing_transfer(object, backing_object);
			object->backing_object_offset +=
			    backing_object->backing_object_offset;
			VM_OBJECT_WUNLOCK(object);
			vm_object_pip_wakeup(object);

			/*
			 * Discard backing_object.
			 *
			 * Since the backing object has no pages, no pager left,
			 * and no object references within it, all that is
			 * necessary is to dispose of it.
			 */
			KASSERT(backing_object->ref_count == 1, (
"backing_object %p was somehow re-referenced during collapse!",
			    backing_object));
			vm_object_pip_wakeup(backing_object);
			(void)refcount_release(&backing_object->ref_count);
			vm_object_terminate(backing_object);
			counter_u64_add(object_collapses, 1);
			VM_OBJECT_WLOCK(object);
		} else {
			/*
			 * If we do not entirely shadow the backing object,
			 * there is nothing we can do so we give up.
			 *
			 * The object lock and backing_object lock must not
			 * be dropped during this sequence.
			 */
			if (!vm_object_scan_all_shadowed(object)) {
				VM_OBJECT_WUNLOCK(backing_object);
				break;
			}

			/*
			 * Make the parent shadow the next object in the
			 * chain.  Deallocating backing_object will not remove
			 * it, since its reference count is at least 2.
			 */
			vm_object_backing_remove_locked(object);
			new_backing_object = backing_object->backing_object;
			if (new_backing_object != NULL) {
				vm_object_backing_insert_ref(object,
				    new_backing_object);
				object->backing_object_offset +=
				    backing_object->backing_object_offset;
			}

			/*
			 * Drop the reference count on backing_object.  Since
			 * its ref_count was at least 2, it will not vanish.
			 */
			(void)refcount_release(&backing_object->ref_count);
			KASSERT(backing_object->ref_count >= 1, (
"backing_object %p was somehow dereferenced during collapse!",
			    backing_object));
			VM_OBJECT_WUNLOCK(backing_object);
			counter_u64_add(object_bypasses, 1);
		}

		/*
		 * Try again with this object's new backing object.
		 */
	}
}

/*
 *	vm_object_page_remove:
 *
 *	For the given object, either frees or invalidates each of the
 *	specified pages.  In general, a page is freed.
However, if a page is
 *	wired for any reason other than the existence of a managed, wired
 *	mapping, then it may be invalidated but not removed from the object.
 *	Pages are specified by the given range ["start", "end") and the option
 *	OBJPR_CLEANONLY.  As a special case, if "end" is zero, then the range
 *	extends from "start" to the end of the object.  If the option
 *	OBJPR_CLEANONLY is specified, then only the non-dirty pages within the
 *	specified range are affected.  If the option OBJPR_NOTMAPPED is
 *	specified, then the pages within the specified range must have no
 *	mappings.  Otherwise, if this option is not specified, any mappings to
 *	the specified pages are removed before the pages are freed or
 *	invalidated.
 *
 *	In general, this operation should only be performed on objects that
 *	contain managed pages.  There are, however, two exceptions.  First, it
 *	is performed on the kernel and kmem objects by vm_map_entry_delete().
 *	Second, it is used by msync(..., MS_INVALIDATE) to invalidate device-
 *	backed pages.  In both of these cases, the option OBJPR_CLEANONLY must
 *	not be specified and the option OBJPR_NOTMAPPED must be specified.
 *
 *	The object must be locked.
 */
void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
    int options)
{
	vm_page_t p, next;

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
	    ("vm_object_page_remove: illegal options for object %p", object));
	/*
	 * NOTE(review): this early return also skips the
	 * vm_pager_freespace() call at the end of the function - confirm
	 * that is intended when the object has no resident pages.
	 */
	if (object->resident_page_count == 0)
		return;
	vm_object_pip_add(object, 1);
again:
	p = vm_page_find_least(object, start);

	/*
	 * Here, the variable "p" is either (1) the page with the least pindex
	 * greater than or equal to the parameter "start" or (2) NULL.
	 */
	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
		next = TAILQ_NEXT(p, listq);

		/*
		 * Skip invalid pages if asked to do so.  Try to avoid acquiring
		 * the busy lock, as some consumers rely on this to avoid
		 * deadlocks.
		 *
		 * A thread may concurrently transition the page from invalid to
		 * valid using only the busy lock, so the result of this check
		 * is immediately stale.  It is up to consumers to handle this,
		 * for instance by ensuring that all invalid->valid transitions
		 * happen with a mutex held, as may be possible for a
		 * filesystem.
		 */
		if ((options & OBJPR_VALIDONLY) != 0 && vm_page_none_valid(p))
			continue;

		/*
		 * If the page is wired for any reason besides the existence
		 * of managed, wired mappings, then it cannot be freed.  For
		 * example, fictitious pages, which represent device memory,
		 * are inherently wired and cannot be freed.  They can,
		 * however, be invalidated if the option OBJPR_CLEANONLY is
		 * not specified.
		 */
		if (vm_page_tryxbusy(p) == 0) {
			/* Sleep on the busy page and rescan from "start". */
			if (vm_page_busy_sleep(p, "vmopar", 0))
				VM_OBJECT_WLOCK(object);
			goto again;
		}
		/* Repeat the validity check now that the page is busied. */
		if ((options & OBJPR_VALIDONLY) != 0 && vm_page_none_valid(p)) {
			vm_page_xunbusy(p);
			continue;
		}
		if (vm_page_wired(p)) {
			/*
			 * Also reached by goto when removing the page's
			 * mappings below fails: the page is kept in the
			 * object and at most invalidated.
			 */
wired:
			if ((options & OBJPR_NOTMAPPED) == 0 &&
			    object->ref_count != 0)
				pmap_remove_all(p);
			if ((options & OBJPR_CLEANONLY) == 0) {
				vm_page_invalid(p);
				vm_page_undirty(p);
			}
			vm_page_xunbusy(p);
			continue;
		}
		KASSERT((p->flags & PG_FICTITIOUS) == 0,
		    ("vm_object_page_remove: page %p is fictitious", p));
		if ((options & OBJPR_CLEANONLY) != 0 &&
		    !vm_page_none_valid(p)) {
			if ((options & OBJPR_NOTMAPPED) == 0 &&
			    object->ref_count != 0 &&
			    !vm_page_try_remove_write(p))
				goto wired;
			/* With OBJPR_CLEANONLY, dirty pages are left alone. */
			if (p->dirty != 0) {
				vm_page_xunbusy(p);
				continue;
			}
		}
		if ((options & OBJPR_NOTMAPPED) == 0 &&
		    object->ref_count != 0 && !vm_page_try_remove_all(p))
			goto wired;
		vm_page_free(p);
	}
	vm_object_pip_wakeup(object);

	/* Finally release the pager space backing the whole range. */
	vm_pager_freespace(object, start, (end == 0 ? object->size : end) -
	    start);
}

/*
 *	vm_object_page_noreuse:
 *
 *	For the given object, attempt to move the specified pages to
 *	the head of the inactive queue.
This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); vm_page_deactivate_noreuse(p); } } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { rv = vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL); if (rv != VM_PAGER_OK) break; /* * Keep "m" busy because a subsequent iteration may unlock * the object. 
*/ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); if ((prev_object->flags & OBJ_ANON) == 0) return (FALSE); VM_OBJECT_WLOCK(prev_object); /* * Try to collapse the object first. */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if (prev_object->ref_count > 1 && prev_object->size != next_pindex && (prev_object->flags & OBJ_ONEMAPPING) == 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. 
Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty_(vm_object_t object) { atomic_add_int(&object->generation, 1); } bool vm_object_mightbedirty_(vm_object_t object) { return (object->generation != object->cleangeneration); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. 
*/
void
vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
    uint8_t queue)
{
	vm_object_t tobject, t1object;
	vm_page_t m, tm;
	vm_pindex_t end_pindex, pindex, tpindex;
	int depth, locked_depth;

	KASSERT((offset & PAGE_MASK) == 0,
	    ("vm_object_unwire: offset is not page aligned"));
	KASSERT((length & PAGE_MASK) == 0,
	    ("vm_object_unwire: length is not a multiple of PAGE_SIZE"));
	/* The wired count of a fictitious page never changes. */
	if ((object->flags & OBJ_FICTITIOUS) != 0)
		return;
	pindex = OFF_TO_IDX(offset);
	end_pindex = pindex + atop(length);
again:
	/*
	 * "locked_depth" is the number of objects, counted from "object" down
	 * the backing_object chain, that are currently read-locked.  A restart
	 * resumes at the current "pindex"; it is not reset here.
	 */
	locked_depth = 1;
	VM_OBJECT_RLOCK(object);
	m = vm_page_find_least(object, pindex);
	while (pindex < end_pindex) {
		if (m == NULL || pindex < m->pindex) {
			/*
			 * The first object in the shadow chain doesn't
			 * contain a page at the current index.  Therefore,
			 * the page must exist in a backing object.
			 */
			tobject = object;
			tpindex = pindex;
			depth = 0;
			do {
				/* Translate the index into the backing object. */
				tpindex +=
				    OFF_TO_IDX(tobject->backing_object_offset);
				tobject = tobject->backing_object;
				KASSERT(tobject != NULL,
				    ("vm_object_unwire: missing page"));
				/* Nothing to unwire in a fictitious object. */
				if ((tobject->flags & OBJ_FICTITIOUS) != 0)
					goto next_page;
				depth++;
				/*
				 * Lock this level only if an earlier index has
				 * not already locked it.
				 */
				if (depth == locked_depth) {
					locked_depth++;
					VM_OBJECT_RLOCK(tobject);
				}
			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
			    NULL);
		} else {
			/* The top-level object holds the page at "pindex". */
			tm = m;
			m = TAILQ_NEXT(m, listq);
		}
		if (vm_page_trysbusy(tm) == 0) {
			/*
			 * The page could not be shared-busied.  Release every
			 * accumulated lock except the one on the page's own
			 * object, then sleep on the page.  When
			 * vm_page_busy_sleep() returns false that remaining
			 * lock is released here.  Then start over.
			 */
			for (tobject = object; locked_depth >= 1;
			    locked_depth--) {
				t1object = tobject->backing_object;
				if (tm->object != tobject)
					VM_OBJECT_RUNLOCK(tobject);
				tobject = t1object;
			}
			tobject = tm->object;
			if (!vm_page_busy_sleep(tm, "unwbo",
			    VM_ALLOC_IGN_SBUSY))
				VM_OBJECT_RUNLOCK(tobject);
			goto again;
		}
		vm_page_unwire(tm, queue);
		vm_page_sunbusy(tm);
next_page:
		pindex++;
	}
	/* Release the accumulated object locks. */
	for (tobject = object; locked_depth >= 1; locked_depth--) {
		/* Read the link before unlocking the object that holds it. */
		t1object = tobject->backing_object;
		VM_OBJECT_RUNLOCK(tobject);
		tobject = t1object;
	}
}

/*
 * Return the vnode for the given object, or NULL if none exists.
* For tmpfs objects, the function may return NULL if there is * no vnode allocated at the time of the call. */ struct vnode * vm_object_vnode(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_LOCKED(object); vm_pager_getvp(object, &vp, NULL); return (vp); } /* * Busy the vm object. This prevents new pages belonging to the object from * becoming busy. Existing pages persist as busy. Callers are responsible * for checking page state before proceeding. */ void vm_object_busy(vm_object_t obj) { VM_OBJECT_ASSERT_LOCKED(obj); blockcount_acquire(&obj->busy, 1); /* The fence is required to order loads of page busy. */ atomic_thread_fence_acq_rel(); } void vm_object_unbusy(vm_object_t obj) { blockcount_release(&obj->busy, 1); } void vm_object_busy_wait(vm_object_t obj, const char *wmesg) { VM_OBJECT_ASSERT_UNLOCKED(obj); (void)blockcount_sleep(&obj->busy, NULL, wmesg, PVM); } /* * This function aims to determine if the object is mapped, * specifically, if it is referenced by a vm_map_entry. Because * objects occasionally acquire transient references that do not * represent a mapping, the method used here is inexact. However, it * has very low overhead and is good enough for the advisory * vm.vmtotal sysctl. */ bool vm_object_is_active(vm_object_t obj) { return (obj->ref_count > atomic_load_int(&obj->shadow_count)); } static int vm_object_list_handler(struct sysctl_req *req, bool swap_only) { struct kinfo_vmobject *kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; u_long sp; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. 
*/ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK); error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD || (swap_only && (obj->flags & (OBJ_ANON | OBJ_SWAP)) == 0)) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD || (swap_only && (obj->flags & (OBJ_ANON | OBJ_SWAP)) == 0)) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = atomic_load_int(&obj->shadow_count); kvo->kvo_memattr = obj->memattr; kvo->kvo_active = 0; kvo->kvo_inactive = 0; if (!swap_only) { TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->a.queue == PQ_ACTIVE) kvo->kvo_active++; else if (m->a.queue == PQ_INACTIVE) kvo->kvo_inactive++; } } kvo->kvo_vn_fileid = 0; kvo->kvo_vn_fsid = 0; kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; vp = NULL; kvo->kvo_type = vm_object_kvme_type(obj, swap_only ? 
NULL : &vp); if (vp != NULL) { vref(vp); } else if ((obj->flags & OBJ_ANON) != 0) { MPASS(kvo->kvo_type == KVME_TYPE_DEFAULT || kvo->kvo_type == KVME_TYPE_SWAP); kvo->kvo_me = (uintptr_t)obj; /* tmpfs objs are reported as vnodes */ kvo->kvo_backing_obj = (uintptr_t)obj->backing_object; sp = swap_pager_swapped_pages(obj); kvo->kvo_swapped = sp > UINT32_MAX ? UINT32_MAX : sp; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo->kvo_vn_fileid = va.va_fileid; kvo->kvo_vn_fsid = va.va_fsid; kvo->kvo_vn_fsid_freebsd11 = va.va_fsid; /* truncate */ } vput(vp); } strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo->kvo_path) + 1; kvo->kvo_structsize = roundup(kvo->kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize); maybe_yield(); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); free(kvo, M_TEMP); return (error); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { return (vm_object_list_handler(req, false)); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); static int sysctl_vm_object_list_swap(SYSCTL_HANDLER_ARGS) { return (vm_object_list_handler(req, true)); } /* * This sysctl returns list of the anonymous or swap objects. Intent * is to provide stripped optimized list useful to analyze swap use. * Since technically non-swap (default) objects participate in the * shadow chains, and are converted to swap type as needed by swap * pager, we must report them. 
*/ SYSCTL_PROC(_vm, OID_AUTO, swap_objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list_swap, "S,kinfo_vmobject", "List of swap VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (map == 0) return 0; if (entry == 0) { VM_MAP_ENTRY_FOREACH(tmpe, map) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; VM_MAP_ENTRY_FOREACH(tmpe, tmpm) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if ((object->flags & OBJ_ANON) != 0) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } if (db_pager_quit) return; } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. 
*/ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", atomic_load_int(&object->shadow_count), object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); if (db_pager_quit) break; } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. 
*/ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (db_pager_quit) return; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (db_pager_quit) return; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (db_pager_quit) return; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (db_pager_quit) return; } } } #endif /* DDB */ diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 127406c0d582..cb7ce428db28 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1,5595 +1,5593 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vm_domain vm_dom[MAXMEMDOM]; DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; struct mtx_padalign __exclusive_cache_line vm_domainset_lock; /* The following fields are protected by the domainset lock. 
*/
domainset_t __exclusive_cache_line vm_min_domains;
domainset_t __exclusive_cache_line vm_severe_domains;
static int vm_min_waiters;
static int vm_severe_waiters;
static int vm_pageproc_waiters;

/* Statistics exported under the vm.stats.page sysctl node. */
static SYSCTL_NODE(_vm_stats, OID_AUTO, page, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM page statistics");

static COUNTER_U64_DEFINE_EARLY(pqstate_commit_retries);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, pqstate_commit_retries,
    CTLFLAG_RD, &pqstate_commit_retries,
    "Number of failed per-page atomic queue state updates");

static COUNTER_U64_DEFINE_EARLY(queue_ops);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_ops,
    CTLFLAG_RD, &queue_ops,
    "Number of batched queue operations");

static COUNTER_U64_DEFINE_EARLY(queue_nops);
SYSCTL_COUNTER_U64(_vm_stats_page, OID_AUTO, queue_nops,
    CTLFLAG_RD, &queue_nops,
    "Number of batched queue operations with no effects");

/*
 * bogus page -- for I/O to/from partially complete buffers,
 * or for paging into sparsely invalid regions.
 */
vm_page_t bogus_page;

vm_page_t vm_page_array;
long vm_page_array_size;
long first_page;

struct bitset *vm_page_dump;
long vm_page_dump_pages;

/* Pages pulled out of the free lists by vm_page_blacklist_add(). */
static TAILQ_HEAD(, vm_page) blacklist_head;
static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");

/* UMA zone created in vm_page_init(); items are sizeof(struct vm_page). */
static uma_zone_t fakepg_zone;

/* Forward declarations of file-local helpers. */
static void vm_page_alloc_check(vm_page_t m);
static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
    vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked);
static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_enqueue(vm_page_t m, uint8_t queue);
static bool vm_page_free_prep(vm_page_t m);
static void vm_page_free_toq(vm_page_t m);
static void vm_page_init(void *dummy);
static int vm_page_insert_after(vm_page_t m, vm_object_t object,
    vm_pindex_t pindex, vm_page_t mpred);
static void vm_page_insert_radixdone(vm_page_t m,
    vm_object_t object, vm_page_t mpred);
static void vm_page_mvqueue(vm_page_t m, const uint8_t queue,
    const uint16_t nflag);
static int vm_page_reclaim_run(int req_class, int domain, u_long npages,
    vm_page_t m_run, vm_paddr_t high);
static void vm_page_release_toq(vm_page_t m, uint8_t nqueue, bool noreuse);
static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object,
    int req);
static int vm_page_zone_import(void *arg, void **store, int cnt, int domain,
    int flags);
static void vm_page_zone_release(void *arg, void **store, int cnt);

SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL);

/*
 * SYSINIT hook: create the "fakepg" zone and allocate the wired, object-less
 * bogus_page.  The argument is not used.
 */
static void
vm_page_init(void *dummy)
{

	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	bogus_page = vm_page_alloc_noobj(VM_ALLOC_WIRED);
}

/*
 * The cache page zone is initialized later since we need to be able to allocate
 * pages before UMA is fully initialized.
 */
static void
vm_page_init_cache_zones(void *dummy __unused)
{
	struct vm_domain *vmd;
	struct vm_pgcache *pgcache;
	int cache, domain, maxcache, pool;

	/*
	 * The tunable is a per-CPU value; scale it by the CPU count.  It
	 * stays zero when the tunable is not set.
	 */
	maxcache = 0;
	TUNABLE_INT_FETCH("vm.pgcache_zone_max_pcpu", &maxcache);
	maxcache *= mp_ncpus;
	/* One cache zone per (domain, free pool) pair. */
	for (domain = 0; domain < vm_ndomains; domain++) {
		vmd = VM_DOMAIN(domain);
		for (pool = 0; pool < VM_NFREEPOOL; pool++) {
			pgcache = &vmd->vmd_pgcache[pool];
			pgcache->domain = domain;
			pgcache->pool = pool;
			pgcache->zone = uma_zcache_create("vm pgcache",
			    PAGE_SIZE, NULL, NULL, NULL, NULL,
			    vm_page_zone_import, vm_page_zone_release, pgcache,
			    UMA_ZONE_VM);

			/*
			 * Limit each pool's zone to 0.1% of the pages in the
			 * domain, unless the tunable supplied a non-zero
			 * limit.
			 */
			cache = maxcache != 0 ? maxcache :
			    vmd->vmd_page_count / 1000;
			uma_zone_set_maxcache(pgcache->zone, cache);
		}
	}
}
SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL);

/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K.
*/
#if PAGE_SIZE == 32768
#ifdef CTASSERT
CTASSERT(sizeof(u_long) >= 8);
#endif
#endif

/*
 *	vm_set_page_size:
 *
 *	Sets the page size, perhaps based upon the memory
 *	size.  Must be called before any use of page-size
 *	dependent functions.
 */
void
vm_set_page_size(void)
{
	/* Default to PAGE_SIZE when no value has been set. */
	if (vm_cnt.v_page_size == 0)
		vm_cnt.v_page_size = PAGE_SIZE;
	/* x & (x - 1) is non-zero unless x is a power of two. */
	if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
		panic("vm_set_page_size: page size not a power of two");
}

/*
 *	vm_page_blacklist_next:
 *
 *	Find the next entry in the provided string of blacklist
 *	addresses.  Entries are separated by space, comma, or newline.
 *	If an invalid integer is encountered then the rest of the
 *	string is skipped.  Updates the list pointer to the next
 *	character, or NULL if the string is exhausted or invalid.
 *
 *	Returns the page-truncated address of the entry, or 0 when no
 *	entry was produced.  Note that the byte at "end" may be
 *	overwritten with a NUL.
 */
static vm_paddr_t
vm_page_blacklist_next(char **list, char *end)
{
	vm_paddr_t bad;
	char *cp, *pos;

	if (list == NULL || *list == NULL)
		return (0);
	/* Empty string: mark the list as exhausted. */
	if (**list =='\0') {
		*list = NULL;
		return (0);
	}

	/*
	 * If there's no end pointer then the buffer is coming from
	 * the kenv and we know it's null-terminated.
	 */
	if (end == NULL)
		end = *list + strlen(*list);

	/* Ensure that strtoq() won't walk off the end */
	if (*end != '\0') {
		/* A separator at "end" can safely be turned into a NUL. */
		if (*end == '\n' || *end == ' ' || *end  == ',')
			*end = '\0';
		else {
			printf("Blacklist not terminated, skipping\n");
			*list = NULL;
			return (0);
		}
	}

	for (pos = *list; *pos != '\0'; pos = cp) {
		bad = strtoq(pos, &cp, 0);
		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
			/*
			 * A zero value followed by a separator is skipped:
			 * step over the separator and keep scanning while
			 * still inside the buffer, otherwise give up.
			 */
			if (bad == 0) {
				if (++cp < end)
					continue;
				else
					break;
			}
		} else
			break;	/* The number is followed by garbage. */
		/*
		 * A non-zero entry was parsed.  Leave *list just past the
		 * separator, or NULL when nothing follows.
		 */
		if (*cp == '\0' || ++cp >= end)
			*list = NULL;
		else
			*list = cp;
		return (trunc_page(bad));
	}
	printf("Garbage in RAM blacklist, skipping\n");
	*list = NULL;
	return (0);
}

/*
 * Remove the page at physical address "pa" from the free lists and record it
 * on blacklist_head.
 *
 * Returns true when no vm_page exists for "pa"; otherwise returns the result
 * of vm_phys_unfree_page().  When that result is non-zero the domain's free
 * count is decremented and, if "verbose" is set, a message is printed.
 */
bool
vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
{
	struct vm_domain *vmd;
	vm_page_t m;
	int ret;

	m = vm_phys_paddr_to_vm_page(pa);
	if (m == NULL)
		return (true); /* page does not exist, no failure */

	vmd = vm_pagequeue_domain(m);
	/* The free lists are only touched under the domain free lock. */
	vm_domain_free_lock(vmd);
	ret = vm_phys_unfree_page(m);
	vm_domain_free_unlock(vmd);
	if (ret != 0) {
		vm_domain_freecnt_inc(vmd, -1);
		TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
		if (verbose)
			printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
	}
	return (ret);
}

/*
 *	vm_page_blacklist_check:
 *
 *	Iterate through the provided string of blacklist addresses, pulling
 *	each entry out of the physical allocator free list and putting it
 *	onto a list for reporting via the vm.page_blacklist sysctl.
 */
static void
vm_page_blacklist_check(char *list, char *end)
{
	vm_paddr_t pa;
	char *next;

	next = list;
	/* vm_page_blacklist_next() sets "next" to NULL when it is done. */
	while (next != NULL) {
		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
			continue;
		vm_page_blacklist_add(pa, bootverbose);
	}
}

/*
 *	vm_page_blacklist_load:
 *
 *	Search for a special module named "ram_blacklist".  It'll be a
 *	plain text file provided by the user via the loader directive
 *	of the same name.
*/
static void
vm_page_blacklist_load(char **list, char **end)
{
	void *mod;
	u_char *ptr;
	u_int len;

	mod = NULL;
	ptr = NULL;

	mod = preload_search_by_type("ram_blacklist");
	if (mod != NULL) {
		ptr = preload_fetch_addr(mod);
		len = preload_fetch_size(mod);
	}
	/*
	 * On return *list is the start of the module data and *end is one
	 * past its last byte; both are NULL when there is no data.  "len" is
	 * only read when "ptr" is non-NULL, i.e. after it has been assigned.
	 */
	*list = ptr;
	if (ptr != NULL)
		*end = ptr + len;
	else
		*end = NULL;
	return;
}

/*
 * Handler for the vm.page_blacklist sysctl: emit the physical addresses of
 * the pages on blacklist_head as a comma-separated list.
 */
static int
sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
{
	vm_page_t m;
	struct sbuf sbuf;
	int error, first;

	first = 1;
	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	TAILQ_FOREACH(m, &blacklist_head, listq) {
		/* No separator in front of the first entry. */
		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
		    (uintmax_t)m->phys_addr);
		first = 0;
	}
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	return (error);
}

/*
 * Initialize a dummy page for use in scans of the specified paging queue.
 * In principle, this function only needs to set the flag PG_MARKER.
 * Nonetheless, it write busies the page as a safety precaution.
 */
void
vm_page_init_marker(vm_page_t marker, int queue, uint16_t aflags)
{

	bzero(marker, sizeof(*marker));
	marker->flags = PG_MARKER;
	marker->a.flags = aflags;
	marker->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
	marker->a.queue = queue;
}

/*
 * Reset and initialize the vm_domain structure for "domain": page queue
 * names, locks and markers, the free and pageout mutexes, and the marker
 * pages that are pre-inserted into the inactive and active queues.
 */
static void
vm_page_domain_init(int domain)
{
	struct vm_domain *vmd;
	struct vm_pagequeue *pq;
	int i;

	vmd = VM_DOMAIN(domain);
	bzero(vmd, sizeof(*vmd));
	/* pq_name is assigned through __DECONST. */
	*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
	    "vm inactive pagequeue";
	*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
	    "vm active pagequeue";
	*__DECONST(const char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
	    "vm laundry pagequeue";
	*__DECONST(const char **,
	    &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) =
	    "vm unswappable pagequeue";
	vmd->vmd_domain = domain;
	vmd->vmd_page_count = 0;
	vmd->vmd_free_count = 0;
	vmd->vmd_segs = 0;
	vmd->vmd_oom = FALSE;
	/* Per-queue list head, mutex (named after the queue) and marker. */
	for (i = 0; i < PQ_COUNT; i++) {
		pq = &vmd->vmd_pagequeues[i];
		TAILQ_INIT(&pq->pq_pl);
		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
		    MTX_DEF | MTX_DUPOK);
		pq->pq_pdpages = 0;
		vm_page_init_marker(&vmd->vmd_markers[i], i, 0);
	}
	mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
	mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF);
	/* The domain's name is its number printed in decimal. */
	snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);

	/*
	 * inacthead is used to provide FIFO ordering for LRU-bypassing
	 * insertions.
	 */
	vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED);
	TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl,
	    &vmd->vmd_inacthead, plinks.q);

	/*
	 * The clock pages are used to implement active queue scanning without
	 * requeues.  Scans start at clock[0], which is advanced after the scan
	 * ends.  When the two clock hands meet, they are reset and scanning
	 * resumes from the head of the queue.
	 */
	vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED);
	vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED);
	TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
	    &vmd->vmd_clock[0], plinks.q);
	TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl,
	    &vmd->vmd_clock[1], plinks.q);
}

/*
 * Initialize a physical page in preparation for adding it to the free
 * lists.
 *
 * "pa" is stored as the page's physical address and "segind" as its segment
 * index.  The remaining fields are reset to their free-page values before
 * pmap_page_init() is called.
 */
void
vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
{

	m->object = NULL;
	m->ref_count = 0;
	m->busy_lock = VPB_FREED;
	m->flags = m->a.flags = 0;
	m->phys_addr = pa;
	m->a.queue = PQ_NONE;
	m->psind = 0;
	m->segind = segind;
	m->order = VM_NFREEORDER;
	m->pool = VM_FREEPOOL_DEFAULT;
	m->valid = m->dirty = 0;
	pmap_page_init(m);
}

#ifndef PMAP_HAS_PAGE_ARRAY
static vm_paddr_t
vm_page_array_alloc(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t page_range)
{
	vm_paddr_t new_end;

	/*
	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
	 * However, because this page is allocated from KVM, out-of-bounds
	 * accesses using the direct map will not be trapped.
	 */
	*vaddr += PAGE_SIZE;

	/*
	 * Allocate physical memory for the page structures, and map it.
*/ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); vm_page_array = (vm_page_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array_size = page_range; return (new_end); } #endif /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { struct vm_phys_seg *seg; struct vm_domain *vmd; vm_page_t m; char *list, *listend; vm_paddr_t end, high_avail, low_avail, new_end, size; vm_paddr_t page_range __unused; vm_paddr_t last_pa, pa, startp, endp; u_long pagecount; #if MINIDUMP_PAGE_TRACKING u_long vm_page_dump_size; #endif int biggestone, i, segind; #ifdef WITNESS vm_offset_t mapped; int witness_size; #endif #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) long ii; #endif vaddr = round_page(vaddr); vm_phys_early_startup(); biggestone = vm_phys_avail_largest(); end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(i); new_end = end; #ifdef WITNESS witness_size = round_page(witness_startup_count()); new_end -= witness_size; mapped = pmap_map(&vaddr, new_end, new_end + witness_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, witness_size); witness_startup((void *)mapped); #endif #if MINIDUMP_PAGE_TRACKING /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. 
In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; vm_page_dump_pages = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) { vm_page_dump_pages += howmany(dump_avail[i + 1], PAGE_SIZE) - dump_avail[i] / PAGE_SIZE; if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; } vm_page_dump_size = round_page(BITSET_SIZE(vm_page_dump_pages)); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #else (void)last_pa; #endif #if defined(__aarch64__) || defined(__amd64__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include the UMA bootstrap pages, witness pages and vm_page_dump * in a crash dump. When pmap_map() uses the direct map, they are * not automatically included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. 
*/ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef PMAP_HAS_PAGE_ARRAY pmap_page_array_startup(size / PAGE_SIZE); biggestone = vm_phys_avail_largest(); end = new_end = phys_avail[biggestone + 1]; #else #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. */ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; new_end = vm_page_array_alloc(&vaddr, end, page_range); #endif #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. 
*/ new_end = vm_reserv_startup(&vaddr, new_end); #endif #if defined(__aarch64__) || defined(__amd64__) || \ defined(__riscv) || defined(__powerpc64__) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) if (vm_phys_avail_size(i) != 0) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) for (ii = 0; ii < vm_page_array_size; ii++) { m = &vm_page_array[ii]; vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0); m->flags = PG_FICTITIOUS; } #endif vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) vm_page_init_page(m, pa, segind); /* * Add the segment's pages that are covered by one of * phys_avail's ranges to the free lists. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (seg->end <= phys_avail[i] || seg->start >= phys_avail[i + 1]) continue; startp = MAX(seg->start, phys_avail[i]); endp = MIN(seg->end, phys_avail[i + 1]); pagecount = (u_long)atop(endp - startp); if (pagecount == 0) continue; m = seg->first_page + atop(startp - seg->start); vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_enqueue_contig(m, pagecount); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, pagecount); vm_cnt.v_page_count += (u_int)pagecount; vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << segind; } } /* * Remove blacklisted pages from the physical memory allocator. 
*/ TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_trybusy * * Helper routine for grab functions to trylock busy. * * Returns true on success and false on failure. */ static bool vm_page_trybusy(vm_page_t m, int allocflags) { if ((allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0) return (vm_page_trysbusy(m)); else return (vm_page_tryxbusy(m)); } /* * vm_page_tryacquire * * Helper routine for grab functions to trylock busy and wire. * * Returns true on success and false on failure. */ static inline bool vm_page_tryacquire(vm_page_t m, int allocflags) { bool locked; locked = vm_page_trybusy(m, allocflags); if (locked && (allocflags & VM_ALLOC_WIRED) != 0) vm_page_wire(m); return (locked); } /* * vm_page_busy_acquire: * * Acquire the busy lock as described by VM_ALLOC_* flags. Will loop * and drop the object lock if necessary. */ bool vm_page_busy_acquire(vm_page_t m, int allocflags) { vm_object_t obj; bool locked; /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. 
*/ obj = atomic_load_ptr(&m->object); for (;;) { if (vm_page_tryacquire(m, allocflags)) return (true); if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); if (obj != NULL) locked = VM_OBJECT_WOWNED(obj); else locked = false; MPASS(locked || vm_page_wired(m)); if (_vm_page_busy_sleep(obj, m, m->pindex, "vmpba", allocflags, locked) && locked) VM_OBJECT_WLOCK(obj); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); KASSERT(m->object == obj || m->object == NULL, ("vm_page_busy_acquire: page %p does not belong to %p", m, obj)); } } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; vm_page_assert_xbusied(m); x = vm_page_busy_fetch(m); for (;;) { if (atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_SHARERS_WORD(1))) break; } if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * * vm_page_busy_tryupgrade: * * Attempt to upgrade a single shared busy into an exclusive busy. */ int vm_page_busy_tryupgrade(vm_page_t m) { u_int ce, x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); ce = VPB_CURTHREAD_EXCLUSIVE; for (;;) { if (VPB_SHARERS(x) > 1) return (0); KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_busy_tryupgrade: invalid lock state")); if (!atomic_fcmpset_acq_int(&m->busy_lock, &x, ce | (x & VPB_BIT_WAITERS))) continue; return (1); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. 
*/ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); x = vm_page_busy_fetch(m); for (;;) { KASSERT(x != VPB_FREED, ("vm_page_sunbusy: Unlocking freed page.")); if (VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) == 0) break; wakeup(m); break; } } /* * vm_page_busy_sleep: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of the busying mechanism. * * If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function * will not sleep if the page is shared-busy. * * The object lock must be held on entry. * * Returns true if it slept and dropped the object lock, or false * if there was no sleep and the lock is still held. */ bool vm_page_busy_sleep(vm_page_t m, const char *wmesg, int allocflags) { vm_object_t obj; obj = m->object; VM_OBJECT_ASSERT_LOCKED(obj); return (_vm_page_busy_sleep(obj, m, m->pindex, wmesg, allocflags, true)); } /* * vm_page_busy_sleep_unlocked: * * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * If VM_ALLOC_IGN_SBUSY is specified in allocflags, the function * will not sleep if the page is shared-busy. * * The object lock must not be held on entry. The operation will * return if the page changes identity. */ void vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags) { VM_OBJECT_ASSERT_UNLOCKED(obj); (void)_vm_page_busy_sleep(obj, m, pindex, wmesg, allocflags, false); } /* * _vm_page_busy_sleep: * * Internal busy sleep function. Verifies the page identity and * lockstate against parameters. Returns true if it sleeps and * false otherwise. 
* * allocflags uses VM_ALLOC_* flags to specify the lock required. * * If locked is true the lock will be dropped for any true returns * and held for any false returns. */ static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { bool xsleep; u_int x; /* * If the object is busy we must wait for that to drain to zero * before trying the page again. */ if (obj != NULL && vm_object_busied(obj)) { if (locked) VM_OBJECT_DROP(obj); vm_object_busy_wait(obj, wmesg); return (true); } /* Nothing to wait for if the page is not busy at all. */ if (!vm_page_busied(m)) return (false); /* * With VM_ALLOC_SBUSY or VM_ALLOC_IGN_SBUSY the caller does not * need to wait for a page whose lock word has the shared bit set. */ xsleep = (allocflags & (VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY)) != 0; /* * The sleepqueue lock for the page is taken before the waiters bit * is set and is kept until the thread is queued below. */ sleepq_lock(m); x = vm_page_busy_fetch(m); do { /* * If the page changes objects or becomes unlocked we can * simply return. */ if (x == VPB_UNBUSIED || (xsleep && (x & VPB_BIT_SHARED) != 0) || m->object != obj || m->pindex != pindex) { sleepq_release(m); return (false); } /* The waiters bit is already set; no update is needed. */ if ((x & VPB_BIT_WAITERS) != 0) break; } while (!atomic_fcmpset_int(&m->busy_lock, &x, x | VPB_BIT_WAITERS)); if (locked) VM_OBJECT_DROP(obj); DROP_GIANT(); sleepq_add(m, NULL, wmesg, 0, 0); sleepq_wait(m, PVM); PICKUP_GIANT(); return (true); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { vm_object_t obj; u_int x; obj = m->object; x = vm_page_busy_fetch(m); for (;;) { /* Fail unless the lock word has the shared bit set. */ if ((x & VPB_BIT_SHARED) == 0) return (0); /* * Reduce the window for transient busies that will trigger * false negatives in vm_page_ps_test(). */ if (obj != NULL && vm_object_busied(obj)) return (0); if (atomic_fcmpset_acq_int(&m->busy_lock, &x, x + VPB_ONE_SHARER)) break; } /* Refetch the object now that we're guaranteed that it is stable. */ obj = m->object; /* If the object is busy, give the shared busy back and fail. */ if (obj != NULL && vm_object_busied(obj)) { vm_page_sunbusy(m); return (0); } return (1); } /* * vm_page_tryxbusy: * * Try to exclusive busy a page.
* If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_tryxbusy(vm_page_t m) { vm_object_t obj; if (atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED, VPB_CURTHREAD_EXCLUSIVE) == 0) return (0); obj = m->object; if (obj != NULL && vm_object_busied(obj)) { vm_page_xunbusy(m); return (0); } return (1); } static void vm_page_xunbusy_hard_tail(vm_page_t m) { atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* Wake the waiter. */ wakeup(m); } /* * vm_page_xunbusy_hard: * * Called when unbusy has failed because there is a waiter. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_xunbusy_hard_tail(m); } void vm_page_xunbusy_hard_unchecked(vm_page_t m) { vm_page_assert_xbusied_unchecked(m); vm_page_xunbusy_hard_tail(m); } static void vm_page_busy_free(vm_page_t m) { u_int x; atomic_thread_fence_rel(); x = atomic_swap_int(&m->busy_lock, VPB_FREED); if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { for (; count != 0; count--) { vm_page_unwire(*ma, PQ_ACTIVE); ma++; } } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. 
*/ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->a.queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_CURTHREAD_EXCLUSIVE; /* Fictitious pages are unevictable. */ m->ref_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); vm_page_assert_xbusied(m); vm_page_busy_free(m); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. 
*/ KASSERT(!vm_page_none_valid(m), ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ if ((vm_page_busy_fetch(m) & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_xunbusy_unchecked(m); } /* * Destroy the identity of an invalid page and free it if possible. * This is intended to be used when reading a page from backing store fails. */ void vm_page_free_invalid(vm_page_t m) { KASSERT(vm_page_none_valid(m), ("page %p is valid", m)); KASSERT(!pmap_page_is_mapped(m), ("page %p is mapped", m)); KASSERT(m->object != NULL, ("page %p has no object", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); /* * We may be attempting to free the page as part of the handling for an * I/O error, in which case the page was xbusied by a different thread. */ vm_page_xbusy_claim(m); /* * If someone has wired this page while the object lock * was not held, then the thread that unwires is responsible * for freeing the page. Otherwise just free the page now. * The wire count of this unmapped page cannot change while * we have the page xbusy and the page's object wlocked. */ if (vm_page_remove(m)) vm_page_free(m); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. 
*/ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page. */ m->object = object; m->pindex = pindex; m->ref_count |= VPRC_OBJREF; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; m->ref_count &= ~VPRC_OBJREF; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. 
*/ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("vm_page_insert_radixdone: page %p is missing object ref", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_radixdone: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_radixdone: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's generation count. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * Do the work to remove a page from its object. The caller is responsible for * updating the page's fields to reflect this removal. */ static void vm_page_object_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem __diagused; vm_page_assert_xbusied(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("page %p is missing its object ref", m)); /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0) vm_pager_page_unswapped(m); m->object = NULL; mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. 
*/ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. Returns true if the object's reference * was the last reference to the page, and false otherwise. * * The object must be locked and the page must be exclusively busied. * The exclusive busy will be released on return. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ bool vm_page_remove(vm_page_t m) { bool dropped; dropped = vm_page_remove_xbusy(m); vm_page_xunbusy(m); return (dropped); } /* * vm_page_remove_xbusy * * Removes the page but leaves the xbusy held. Returns true if this * removed the final ref and false otherwise. */ bool vm_page_remove_xbusy(vm_page_t m) { vm_page_object_remove(m); return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF); } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_lookup_unlocked: * * Returns the page associated with the object/offset pair specified; * if none is found, NULL is returned. The page may be no longer be * present in the object at the time that this function returns. Only * useful for opportunistic checks such as inmem(). */ vm_page_t vm_page_lookup_unlocked(vm_object_t object, vm_pindex_t pindex) { return (vm_radix_lookup_unlocked(&object->rtree, pindex)); } /* * vm_page_relookup: * * Returns a page that must already have been busied by * the caller. Used for bogus page replacement. 
*/ vm_page_t vm_page_relookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; m = vm_radix_lookup_unlocked(&object->rtree, pindex); KASSERT(m != NULL && (vm_page_busied(m) || vm_page_wired(m)) && m->object == object && m->pindex == pindex, ("vm_page_relookup: Invalid page %p", m)); return (m); } /* * This should only be used by lockless functions for releasing transient * incorrect acquires. The page may have been freed after we acquired a * busy lock. In this case busy_lock == VPB_FREED and we have nothing * further to do. */ static void vm_page_busy_release(vm_page_t m) { u_int x; x = vm_page_busy_fetch(m); for (;;) { if (x == VPB_FREED) break; if ((x & VPB_BIT_SHARED) != 0 && VPB_SHARERS(x) > 1) { if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } KASSERT((x & VPB_BIT_SHARED) != 0 || (x & ~VPB_BIT_WAITERS) == VPB_CURTHREAD_EXCLUSIVE, ("vm_page_busy_release: %p xbusy not owned.", m)); if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); break; } } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. 
*/ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * Both pages must be exclusively busied on enter. The old page is * unbusied on exit. * * A return value of true means mold is now free. If this is not the * final ref and the caller does not hold a wire reference it may not * continue to access the page. */ static bool vm_page_replace_hold(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_t mret __diagused; bool dropped; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_xbusied(mold); KASSERT(mnew->object == NULL && (mnew->ref_count & VPRC_OBJREF) == 0, ("vm_page_replace: page %p already in object", mnew)); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; atomic_set_int(&mnew->ref_count, VPRC_OBJREF); mret = vm_radix_replace(&object->rtree, mnew); KASSERT(mret == mold, ("invalid page replacement, mold=%p, mret=%p", mold, mret)); KASSERT((mold->oflags & VPO_UNMANAGED) == (mnew->oflags & VPO_UNMANAGED), ("vm_page_replace: mismatched VPO_UNMANAGED")); /* Keep the resident page list in sorted order. 
*/ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; /* * The object's resident_page_count does not change because we have * swapped one page for another, but the generation count should * change if the page is dirty. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); dropped = vm_page_drop(mold, VPRC_OBJREF) == VPRC_OBJREF; vm_page_xunbusy(mold); return (dropped); } void vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex, vm_page_t mold) { vm_page_assert_xbusied(mnew); if (vm_page_replace_hold(mnew, object, pindex, mold)) vm_page_free(mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m)); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. 
The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_object_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { return (vm_page_alloc_after(object, pindex, req, vm_radix_lookup_le(&object->rtree, pindex))); } vm_page_t vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req) { return (vm_page_alloc_domain_after(object, pindex, domain, req, vm_radix_lookup_le(&object->rtree, pindex))); } /* * Allocate a page in the specified object with the given page index. To * optimize insertion of the page into the object, the caller must also specifiy * the resident page in the object with largest index smaller than the given * page index, or NULL if no such page exists. 
*/
vm_page_t
vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex,
    int req, vm_page_t mpred)
{
	struct vm_domainset_iter di;
	vm_page_t m;
	int domain;

	/*
	 * Try one domain after another, in the order produced by the
	 * domainset iterator, and stop at the first successful allocation.
	 * Note that the iterator is given a pointer to "req".
	 */
	vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
	do {
		m = vm_page_alloc_domain_after(object, pindex, domain, req,
		    mpred);
		if (m != NULL)
			break;
	} while (vm_domainset_iter_page(&di, object, &domain) == 0);

	return (m);
}

/*
 * Returns true if the number of free pages exceeds the minimum
 * for the request class and false otherwise.
 *
 * On success, "npages" has been atomically subtracted from the domain's
 * free page count, i.e., the pages are reserved for the caller.  On
 * failure the free page count is left unmodified.
 */
static int
_vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages)
{
	u_int limit, old, new;

	/* Select the floor that this request class may not dip below. */
	if (req_class == VM_ALLOC_INTERRUPT)
		limit = 0;
	else if (req_class == VM_ALLOC_SYSTEM)
		limit = vmd->vmd_interrupt_free_min;
	else
		limit = vmd->vmd_free_reserved;

	/*
	 * Attempt to reserve the pages.  Fail if we're below the limit.
	 */
	limit += npages;
	old = vmd->vmd_free_count;
	do {
		if (old < limit)
			return (0);
		new = old - npages;
	} while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0);

	/* Wake the page daemon if we've crossed the threshold. */
	if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old))
		pagedaemon_wakeup(vmd->vmd_domain);

	/* Only update bitsets on transitions. */
	if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) ||
	    (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe))
		vm_domain_set(vmd);

	return (1);
}

/*
 * Reserve "npages" free pages from the domain on behalf of an allocation
 * with request flags "req".  Only the allocation class bits of "req" are
 * used.  Returns non-zero if the pages were reserved and zero otherwise;
 * see _vm_domain_allocate().
 */
int
vm_domain_allocate(struct vm_domain *vmd, int req, int npages)
{
	int req_class;

	/*
	 * The page daemon is allowed to dig deeper into the free page list.
	 */
	req_class = req & VM_ALLOC_CLASS_MASK;
	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
		req_class = VM_ALLOC_SYSTEM;
	return (_vm_domain_allocate(vmd, req_class, npages));
}

/*
 * Allocate a page from the specified domain and insert it into "object" at
 * "pindex", using "mpred" as the insertion hint (see vm_page_alloc_after()).
 *
 * The object must be write locked, and "mpred", if not NULL, must have a
 * page index smaller than "pindex"; both are asserted below.  The page is
 * taken, in order of preference, from a reservation, from the per-CPU page
 * cache zone, or from the physical free page queues.
 *
 * Returns the initialized page, or NULL if no page could be allocated or
 * if inserting the page into the object failed.  Depending on "req", the
 * object lock may be dropped and reacquired before returning.
 */
vm_page_t
vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain,
    int req, vm_page_t mpred)
{
	struct vm_domain *vmd;
	vm_page_t m;
	int flags;

#define	VPA_FLAGS	(VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL |	\
			 VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY |		\
			 VM_ALLOC_SBUSY | VM_ALLOC_WIRED |		\
			 VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK)
	KASSERT((req & ~VPA_FLAGS) == 0,
	    ("invalid request %#x", req));
	KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
	    ("invalid request %#x", req));
	KASSERT(mpred == NULL || mpred->pindex < pindex,
	    ("mpred %p doesn't precede pindex 0x%jx", mpred,
	    (uintmax_t)pindex));
	VM_OBJECT_ASSERT_WLOCKED(object);

	flags = 0;
	m = NULL;
again:
#if VM_NRESERVLEVEL > 0
	/*
	 * Can we allocate the page from a reservation?
	 */
	if (vm_object_reserv(object) &&
	    (m = vm_reserv_alloc_page(object, pindex, domain, req, mpred)) !=
	    NULL) {
		goto found;
	}
#endif
	vmd = VM_DOMAIN(domain);
	/* Next, try the per-CPU page cache zone, if one exists. */
	if (vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone != NULL) {
		m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DEFAULT].zone,
		    M_NOWAIT | M_NOVM);
		if (m != NULL) {
			flags |= PG_PCPU_CACHE;
			goto found;
		}
	}
	if (vm_domain_allocate(vmd, req, 1)) {
		/*
		 * If not, allocate it from the free page queues.
		 */
		vm_domain_free_lock(vmd);
		m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, 0);
		vm_domain_free_unlock(vmd);
		if (m == NULL) {
			/* Give back the page reserved by vm_domain_allocate(). */
			vm_domain_freecnt_inc(vmd, 1);
#if VM_NRESERVLEVEL > 0
			if (vm_reserv_reclaim_inactive(domain))
				goto again;
#endif
		}
	}
	if (m == NULL) {
		/*
		 * Not allocatable, give up.
		 */
		if (vm_domain_alloc_fail(vmd, object, req))
			goto again;
		return (NULL);
	}

	/*
	 * At this point we had better have found a good page.
	 */
found:
	vm_page_dequeue(m);
	vm_page_alloc_check(m);

	/*
	 * Initialize the page.  Only the PG_ZERO flag is inherited.
	 */
	flags |= m->flags & PG_ZERO;
	if ((req & VM_ALLOC_NODUMP) != 0)
		flags |= PG_NODUMP;
	m->flags = flags;
	m->a.flags = 0;
	m->oflags = (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0;
	/* Exclusive busy by default; shared busy or unbusied on request. */
	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
		m->busy_lock = VPB_CURTHREAD_EXCLUSIVE;
	else if ((req & VM_ALLOC_SBUSY) != 0)
		m->busy_lock = VPB_SHARERS_WORD(1);
	else
		m->busy_lock = VPB_UNBUSIED;
	if (req & VM_ALLOC_WIRED) {
		vm_wire_add(1);
		m->ref_count = 1;
	}
	m->a.act_count = 0;

	if (vm_page_insert_after(m, object, pindex, mpred)) {
		/*
		 * Insertion failed: undo the wiring and busy state set
		 * above and free the page again.
		 */
		if (req & VM_ALLOC_WIRED) {
			vm_wire_sub(1);
			m->ref_count = 0;
		}
		KASSERT(m->object == NULL, ("page %p has object", m));
		m->oflags = VPO_UNMANAGED;
		m->busy_lock = VPB_UNBUSIED;
		/* Don't change PG_ZERO. */
		vm_page_free_toq(m);
		if (req & VM_ALLOC_WAITFAIL) {
			VM_OBJECT_WUNLOCK(object);
			vm_radix_wait();
			VM_OBJECT_WLOCK(object);
		}
		return (NULL);
	}

	/* Ignore device objects; the pager sets "memattr" for them. */
	if (object->memattr != VM_MEMATTR_DEFAULT &&
	    (object->flags & OBJ_FICTITIOUS) == 0)
		pmap_page_set_memattr(m, object->memattr);

	return (m);
}

/*
 *	vm_page_alloc_contig:
 *
 *	Allocate a contiguous set of physical pages of the given size "npages"
 *	from the free lists.  All of the physical pages must be at or above
 *	the given physical address "low" and below the given physical address
 *	"high".  The given value "alignment" determines the alignment of the
 *	first physical page in the set.  If the given value "boundary" is
 *	non-zero, then the set of physical pages cannot cross any physical
 *	address boundary that is a multiple of that value.  Both "alignment"
 *	and "boundary" must be a power of two.
 *
 *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
 *	then the memory attribute setting for the physical pages is configured
 *	to the object's memory attribute setting.  Otherwise, the memory
 *	attribute setting for the physical pages is configured to "memattr",
 *	overriding the object's memory attribute setting.
However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } static vm_page_t vm_page_find_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_page_t m_ret; /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ vmd = VM_DOMAIN(domain); if (!vm_domain_allocate(vmd, req, npages)) return (NULL); /* * Try to allocate the pages from the free page queues. */ vm_domain_free_lock(vmd); m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); vm_domain_free_unlock(vmd); if (m_ret != NULL) return (m_ret); #if VM_NRESERVLEVEL > 0 /* * Try to break a reservation to allocate the pages. 
*/ if ((req & VM_ALLOC_NORECLAIM) == 0) { m_ret = vm_reserv_reclaim_contig(domain, npages, low, high, alignment, boundary); if (m_ret != NULL) return (m_ret); } #endif vm_domain_freecnt_inc(vmd, npages); return (NULL); } vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; #define VPAC_FLAGS (VPA_FLAGS | VM_ALLOC_NORECLAIM) KASSERT((req & ~VPAC_FLAGS) == 0, ("invalid request %#x", req)); KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("invalid request %#x", req)); KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM), ("invalid request %#x", req)); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); for (;;) { #if VM_NRESERVLEVEL > 0 /* * Can we allocate the pages from a reservation? */ if (vm_object_reserv(object) && (m_ret = vm_reserv_alloc_contig(object, pindex, domain, req, mpred, npages, low, high, alignment, boundary)) != NULL) { break; } #endif if ((m_ret = vm_page_find_contig_domain(domain, req, npages, low, high, alignment, boundary)) != NULL) break; if (!vm_domain_alloc_fail(VM_DOMAIN(domain), object, req)) return (NULL); } for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = (object->flags & OBJ_UNMANAGED) != 0 ? 
VPO_UNMANAGED : 0; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_CURTHREAD_EXCLUSIVE; else if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); else busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; for (m = m_ret; m < &m_ret[npages]; m++) { m->a.flags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; m->a.act_count = 0; m->oflags = oflags; if (vm_page_insert_after(m, object, pindex, mpred)) { if ((req & VM_ALLOC_WIRED) != 0) vm_wire_sub(npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->ref_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } mpred = m; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } return (m_ret); } /* * Allocate a physical page that is not intended to be inserted into a VM * object. If the "freelist" parameter is not equal to VM_NFREELIST, then only * pages from the specified vm_phys freelist will be returned. */ static __always_inline vm_page_t _vm_page_alloc_noobj_domain(int domain, const int freelist, int req) { struct vm_domain *vmd; vm_page_t m; int flags; #define VPAN_FLAGS (VM_ALLOC_CLASS_MASK | VM_ALLOC_WAITFAIL | \ VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | \ VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | \ VM_ALLOC_NODUMP | VM_ALLOC_ZERO | VM_ALLOC_COUNT_MASK) KASSERT((req & ~VPAN_FLAGS) == 0, ("invalid request %#x", req)); flags = (req & VM_ALLOC_NODUMP) != 0 ? 
PG_NODUMP : 0; vmd = VM_DOMAIN(domain); again: if (freelist == VM_NFREELIST && vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[VM_FREEPOOL_DIRECT].zone, M_NOWAIT | M_NOVM); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { vm_domain_free_lock(vmd); if (freelist == VM_NFREELIST) m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DIRECT, 0); else m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); #if VM_NRESERVLEVEL > 0 if (freelist == VM_NFREELIST && vm_reserv_reclaim_inactive(domain)) goto again; #endif } } if (m == NULL) { if (vm_domain_alloc_fail(vmd, NULL, req)) goto again; return (NULL); } found: vm_page_dequeue(m); vm_page_alloc_check(m); /* * Consumers should not rely on a useful default pindex value. */ m->pindex = 0xdeadc0dedeadc0de; m->flags = (m->flags & PG_ZERO) | flags; m->a.flags = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) { vm_wire_add(1); m->ref_count = 1; } if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } vm_page_t vm_page_alloc_freelist(int freelist, int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { KASSERT(freelist >= 0 && freelist < VM_NFREELIST, ("%s: invalid freelist %d", __func__, freelist)); return (_vm_page_alloc_noobj_domain(domain, freelist, req)); } vm_page_t vm_page_alloc_noobj(int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_noobj_domain(domain, req); if (m != NULL) break; 
} while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_noobj_domain(int domain, int req) { return (_vm_page_alloc_noobj_domain(domain, VM_NFREELIST, req)); } vm_page_t vm_page_alloc_noobj_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_noobj_contig_domain(domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_noobj_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m, m_ret; u_int flags; #define VPANC_FLAGS (VPAN_FLAGS | VM_ALLOC_NORECLAIM) KASSERT((req & ~VPANC_FLAGS) == 0, ("invalid request %#x", req)); KASSERT((req & (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM)) != (VM_ALLOC_WAITOK | VM_ALLOC_NORECLAIM), ("invalid request %#x", req)); KASSERT(((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("invalid request %#x", req)); KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); while ((m_ret = vm_page_find_contig_domain(domain, req, npages, low, high, alignment, boundary)) == NULL) { if (!vm_domain_alloc_fail(VM_DOMAIN(domain), NULL, req)) return (NULL); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); /* * Consumers should not rely on a useful default pindex value. 
*/ m->pindex = 0xdeadc0dedeadc0de; m->a.flags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = VPB_UNBUSIED; if ((req & VM_ALLOC_WIRED) != 0) m->ref_count = 1; m->a.act_count = 0; m->oflags = VPO_UNMANAGED; /* * Zero the page before updating any mappings since the page is * not yet shared with any devices which might require the * non-default memory attribute. pmap_page_set_memattr() * flushes data caches before returning. */ if ((req & VM_ALLOC_ZERO) != 0 && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); } return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->a.queue == PQ_NONE && (m->a.flags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", m, m->a.queue, (m->a.flags & PGA_QUEUE_STATE_MASK))); KASSERT(m->ref_count == 0, ("page %p has references", m)); KASSERT(vm_page_busy_freed(m), ("page %p is not freed", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); pmap_vm_page_alloc_check(m); } static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); /* * The page daemon should avoid creating extra memory pressure since its * main purpose is to replenish the store of free pages. 
*/ if (vmd->vmd_severeset || curproc == pageproc || !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) return (0); domain = vmd->vmd_domain; vm_domain_free_lock(vmd); i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, (vm_page_t *)store); vm_domain_free_unlock(vmd); if (cnt != i) vm_domain_freecnt_inc(vmd, cnt - i); return (i); } static void vm_page_zone_release(void *arg, void **store, int cnt) { struct vm_domain *vmd; struct vm_pgcache *pgcache; vm_page_t m; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); vm_domain_free_lock(vmd); for (i = 0; i < cnt; i++) { m = (vm_page_t)store[i]; vm_phys_free_pages(m, 0); } vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. 
*/ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & PG_MARKER) == 0, ("page %p is PG_MARKER", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->ref_count >= 1, ("fictitious page %p has invalid ref count", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if (!vm_addr_align_ok(pa, alignment)) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (!vm_addr_bound_ok(pa, ptoa(npages), boundary)) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); retry: m_inc = 1; if (vm_page_wired(m)) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ VM_OBJECT_RLOCK(object); if (object != m->object) { VM_OBJECT_RUNLOCK(object); goto retry; } /* Don't care: PG_NODUMP, PG_ZERO. 
*/ - if (object->type != OBJT_DEFAULT && - (object->flags & OBJ_SWAP) == 0 && + if ((object->flags & OBJ_SWAP) == 0 && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ run_ext = 1; } else run_ext = 0; VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page busy lock.) 
(3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct vm_domain *vmd; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Racily check for wirings. Races are handled once the object * lock is held and the page is unmapped. */ if (vm_page_wired(m)) error = EBUSY; else if ((object = atomic_load_ptr(&m->object)) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ VM_OBJECT_WLOCK(object); /* Don't care: PG_NODUMP, PG_ZERO. 
*/ if (m->object != object || - (object->type != OBJT_DEFAULT && - (object->flags & OBJ_SWAP) == 0 && + ((object->flags & OBJ_SWAP) == 0 && object->type != OBJT_VNODE)) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (vm_page_queue(m) != PQ_NONE && vm_page_tryxbusy(m) != 0) { if (vm_page_wired(m)) { vm_page_xunbusy(m); error = EBUSY; goto unlock; } KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT(m->oflags == 0, ("page %p has unexpected oflags", m)); /* Don't care: PGA_NOSYNC. */ if (!vm_page_none_valid(m)) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_noobj_contig( req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_noobj_contig( req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_noobj_contig( req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { vm_page_xunbusy(m); error = ENOMEM; goto unlock; } /* * Unmap the page and check for new * wirings that may have been acquired * through a pmap lookup. */ if (object->ref_count != 0 && !vm_page_try_remove_all(m)) { vm_page_xunbusy(m); vm_page_free(m_new); error = EBUSY; goto unlock; } /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. 
*/ m_new->a.flags = m->a.flags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); m_new->oflags = 0; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_replace_hold(m_new, object, m->pindex, m) && vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); /* * The new page must be deactivated * before the object is unlocked. */ vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { MPASS(vm_page_domain(m) == domain); vmd = VM_DOMAIN(domain); vm_domain_free_lock(vmd); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif vm_domain_free_unlock(vmd); if (order == VM_NFREEORDER) error = EINVAL; } } if ((m = SLIST_FIRST(&free)) != NULL) { int cnt; vmd = VM_DOMAIN(domain); cnt = 0; vm_domain_free_lock(vmd); do { MPASS(vm_page_domain(m) == domain); SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_free_pages(m, 0); cnt++; } while ((m = SLIST_FIRST(&free)) != NULL); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. 
Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform vm_wait() before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, minalign, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); /* * The caller will attempt an allocation after some runs have been * reclaimed and added to the vm_phys buddy lists. Due to limitations * of vm_phys_alloc_contig(), round up the requested length to the next * power of two or maximum chunk size, and ensure that each run is * suitably aligned. */ minalign = 1ul << imin(flsl(npages - 1), VM_NFREEORDER - 1); npages = roundup2(npages, minalign); if (alignment < ptoa(minalign)) alignment = ptoa(minalign); /* * The page daemon is allowed to dig deeper into the free page list. */ req_class = req & VM_ALLOC_CLASS_MASK; if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. 
*/ vmd = VM_DOMAIN(domain); count = vmd->vmd_free_count; if (count < npages + vmd->vmd_free_reserved || (count < npages + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(domain, npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, domain, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. 
*/ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domainset_iter di; int domain; bool ret; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); if (ret) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (ret); } /* * Set the domain in the appropriate page level domainset. */ void vm_domain_set(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (!vmd->vmd_minset && vm_paging_min(vmd)) { vmd->vmd_minset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); } if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { vmd->vmd_severeset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); } mtx_unlock(&vm_domainset_lock); } /* * Clear the domain from the appropriate page level domainset. */ void vm_domain_clear(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_minset && !vm_paging_min(vmd)) { vmd->vmd_minset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); if (vm_min_waiters != 0) { vm_min_waiters = 0; wakeup(&vm_min_domains); } } if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { vmd->vmd_severeset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); if (vm_severe_waiters != 0) { vm_severe_waiters = 0; wakeup(&vm_severe_domains); } } /* * If pageout daemon needs pages, then tell it that there are * some free. */ if (vmd->vmd_pageout_pages_needed && vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { wakeup(&vmd->vmd_pageout_pages_needed); vmd->vmd_pageout_pages_needed = 0; } /* See comments in vm_wait_doms(). 
*/ if (vm_pageproc_waiters) { vm_pageproc_waiters = 0; wakeup(&vm_pageproc_waiters); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the min threshold globally. */ void vm_wait_min(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_min()) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the severe threshold globally. */ void vm_wait_severe(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_severe()) { vm_severe_waiters++; msleep(&vm_severe_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } u_int vm_wait_count(void) { return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); } int vm_wait_doms(const domainset_t *wdoms, int mflags) { int error; error = 0; /* * We use racey wakeup synchronization to avoid expensive global * locking for the pageproc when sleeping with a non-specific vm_wait. * To handle this, we only sleep for one tick in this instance. It * is expected that most allocations for the pageproc will come from * kmem or vm_page_grab* which will use the more specific and * race-free vm_wait_domain(). */ if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); vm_pageproc_waiters++; error = msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP | mflags, "pageprocwait", 1); } else { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(wdoms)) { if (pageproc == NULL) panic("vm_wait in early boot"); vm_min_waiters++; error = msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP | mflags, "vmwait", 0); } else mtx_unlock(&vm_domainset_lock); } return (error); } /* * vm_wait_domain: * * Sleep until free pages are available for allocation. 
* - Called in various places after failed memory allocations. */ void vm_wait_domain(int domain) { struct vm_domain *vmd; domainset_t wdom; vmd = VM_DOMAIN(domain); vm_domain_free_assert_unlocked(vmd); if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { vmd->vmd_pageout_pages_needed = 1; msleep(&vmd->vmd_pageout_pages_needed, &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); } else mtx_unlock(&vm_domainset_lock); } else { DOMAINSET_ZERO(&wdom); DOMAINSET_SET(vmd->vmd_domain, &wdom); vm_wait_doms(&wdom, 0); } } static int vm_wait_flags(vm_object_t obj, int mflags) { struct domainset *d; d = NULL; /* * Carefully fetch pointers only once: the struct domainset * itself is ummutable but the pointer might change. */ if (obj != NULL) d = obj->domain.dr_policy; if (d == NULL) d = curthread->td_domain.dr_policy; return (vm_wait_doms(&d->ds_mask, mflags)); } /* * vm_wait: * * Sleep until free pages are available for allocation in the * affinity domains of the obj. If obj is NULL, the domain set * for the calling thread is used. * Called in various places after failed memory allocations. */ void vm_wait(vm_object_t obj) { (void)vm_wait_flags(obj, 0); } int vm_wait_intr(vm_object_t obj) { return (vm_wait_flags(obj, PCATCH)); } /* * vm_domain_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the * domain_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. 
* */
static int
vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req)
{

	/* The domain free lock must not be held by the caller. */
	vm_domain_free_assert_unlocked(vmd);

	/*
	 * Record the shortfall: the count encoded in the upper bits of
	 * "req", but at least one page.
	 */
	atomic_add_int(&vmd->vmd_pageout_deficit,
	    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
	if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
		/* Do not hold the object lock across the sleep. */
		if (object != NULL)
			VM_OBJECT_WUNLOCK(object);
		vm_wait_domain(vmd->vmd_domain);
		if (object != NULL)
			VM_OBJECT_WLOCK(object);
		/* Only WAITOK asks the caller to retry. */
		if (req & VM_ALLOC_WAITOK)
			return (EAGAIN);
	}

	return (0);
}

/*
 *	vm_waitpfault:
 *
 *	Sleep until free pages are available for allocation.
 *	- Called only in vm_fault so that processes page faulting
 *	  can be easily tracked.
 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
 *	  processes will be able to grab memory first.  Do not change
 *	  this balance without careful testing first.
 */
void
vm_waitpfault(struct domainset *dset, int timo)
{

	/*
	 * XXX Ideally we would wait only until the allocation could
	 * be satisfied.  This condition can cause new allocators to
	 * consume all freed pages while old allocators wait.
	 */
	mtx_lock(&vm_domainset_lock);
	if (vm_page_count_min_set(&dset->ds_mask)) {
		vm_min_waiters++;
		msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP,
		    "pfault", timo);
	} else
		mtx_unlock(&vm_domainset_lock);
}

/*
 * Return the page queue with index "queue" in the domain that the page
 * belongs to.
 */
static struct vm_pagequeue *
_vm_page_pagequeue(vm_page_t m, uint8_t queue)
{

	return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]);
}

#ifdef INVARIANTS
/*
 * Return the page queue named by the page's current atomic state.  Only
 * compiled for INVARIANTS kernels, where it is used in assertions.
 */
static struct vm_pagequeue *
vm_page_pagequeue(vm_page_t m)
{

	return (_vm_page_pagequeue(m, vm_page_astate_load(m).queue));
}
#endif

/*
 * Attempt to replace the page's atomic state "*old" with "new".  A failed
 * compare-and-set is retried for as long as the state reported back in
 * "*old" still equals the value it had on entry; each failure bumps the
 * pqstate_commit_retries counter.  Returns true on success and false once
 * the state is observed to have changed, with "*old" holding the new
 * observation.
 */
static __always_inline bool
vm_page_pqstate_fcmpset(vm_page_t m, vm_page_astate_t *old,
    vm_page_astate_t new)
{
	vm_page_astate_t tmp;

	tmp = *old;
	do {
		if (__predict_true(vm_page_astate_fcmpset(m, old, new)))
			return (true);
		counter_u64_add(pqstate_commit_retries, 1);
	} while (old->_bits == tmp._bits);

	return (false);
}

/*
 * Do the work of committing a queue state update that moves the page out of
 * its current queue.
*/
static bool
_vm_page_pqstate_commit_dequeue(struct vm_pagequeue *pq, vm_page_t m,
    vm_page_astate_t *old, vm_page_astate_t new)
{
	vm_page_t next;

	vm_pagequeue_assert_locked(pq);
	KASSERT(vm_page_pagequeue(m) == pq,
	    ("%s: queue %p does not match page %p", __func__, pq, m));
	KASSERT(old->queue != PQ_NONE && new.queue != old->queue,
	    ("%s: invalid queue indices %d %d",
	    __func__, old->queue, new.queue));

	/*
	 * Once the queue index of the page changes there is nothing
	 * synchronizing with further updates to the page's physical
	 * queue state.  Therefore we must speculatively remove the page
	 * from the queue now and be prepared to roll back if the queue
	 * state update fails.  If the page is not physically enqueued then
	 * we just update its queue index.
	 */
	if ((old->flags & PGA_ENQUEUED) != 0) {
		new.flags &= ~PGA_ENQUEUED;
		/* Remember the successor so the page can be put back. */
		next = TAILQ_NEXT(m, plinks.q);
		TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
		vm_pagequeue_cnt_dec(pq);
		if (!vm_page_pqstate_fcmpset(m, old, new)) {
			/* Roll back: reinsert at the original position. */
			if (next == NULL)
				TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
			else
				TAILQ_INSERT_BEFORE(next, m, plinks.q);
			vm_pagequeue_cnt_inc(pq);
			return (false);
		} else {
			return (true);
		}
	} else {
		return (vm_page_pqstate_fcmpset(m, old, new));
	}
}

/*
 * Locking wrapper around _vm_page_pqstate_commit_dequeue(): lock the queue
 * named by "old->queue", confirm that the page state still matches "*old",
 * and then attempt the dequeue.  Returns false, with "*old" refreshed, if
 * the state changed before the lock was acquired or the commit failed.
 */
static bool
vm_page_pqstate_commit_dequeue(vm_page_t m, vm_page_astate_t *old,
    vm_page_astate_t new)
{
	struct vm_pagequeue *pq;
	vm_page_astate_t as;
	bool ret;

	pq = _vm_page_pagequeue(m, old->queue);

	/*
	 * The queue field and PGA_ENQUEUED flag are stable only so long as the
	 * corresponding page queue lock is held.
	 */
	vm_pagequeue_lock(pq);
	as = vm_page_astate_load(m);
	if (__predict_false(as._bits != old->_bits)) {
		*old = as;
		ret = false;
	} else {
		ret = _vm_page_pqstate_commit_dequeue(pq, m, old, new);
	}
	vm_pagequeue_unlock(pq);
	return (ret);
}

/*
 * Commit a queue state update that enqueues or requeues a page.
*/
static bool
_vm_page_pqstate_commit_requeue(struct vm_pagequeue *pq, vm_page_t m,
    vm_page_astate_t *old, vm_page_astate_t new)
{
	struct vm_domain *vmd;

	vm_pagequeue_assert_locked(pq);
	KASSERT(old->queue != PQ_NONE && new.queue == old->queue,
	    ("%s: invalid queue indices %d %d",
	    __func__, old->queue, new.queue));

	/*
	 * Unlike the dequeue case, the state is committed first and the
	 * physical queue is only manipulated once the commit has succeeded.
	 */
	new.flags |= PGA_ENQUEUED;
	if (!vm_page_pqstate_fcmpset(m, old, new))
		return (false);

	/*
	 * A page that was already enqueued is unlinked before reinsertion;
	 * otherwise this is a fresh insertion and the queue count grows.
	 */
	if ((old->flags & PGA_ENQUEUED) != 0)
		TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
	else
		vm_pagequeue_cnt_inc(pq);

	/*
	 * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE.  In particular, if
	 * both flags are set in close succession, only PGA_REQUEUE_HEAD will be
	 * applied, even if it was set first.
	 */
	if ((old->flags & PGA_REQUEUE_HEAD) != 0) {
		vmd = vm_pagequeue_domain(m);
		KASSERT(pq == &vmd->vmd_pagequeues[PQ_INACTIVE],
		    ("%s: invalid page queue for page %p", __func__, m));
		TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
	} else {
		TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
	}
	return (true);
}

/*
 * Commit a queue state update that encodes a request for a deferred queue
 * operation.
 *
 * If "new" differs from "*old" the state is updated first; on failure
 * false is returned without submitting the page.  Otherwise the page is
 * handed to the batch queue for "new.queue" and true is returned.
 */
static bool
vm_page_pqstate_commit_request(vm_page_t m, vm_page_astate_t *old,
    vm_page_astate_t new)
{

	KASSERT(old->queue == new.queue || new.queue != PQ_NONE,
	    ("%s: invalid state, queue %d flags %x",
	    __func__, new.queue, new.flags));

	if (old->_bits != new._bits &&
	    !vm_page_pqstate_fcmpset(m, old, new))
		return (false);
	vm_page_pqbatch_submit(m, new.queue);
	return (true);
}

/*
 * A generic queue state update function.  This handles more cases than the
 * specialized functions above.
*/
bool
vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old, vm_page_astate_t new)
{

	/* Nothing to do if the state is unchanged. */
	if (old->_bits == new._bits)
		return (true);

	if (old->queue != PQ_NONE && new.queue != old->queue) {
		/*
		 * The page is leaving its current queue: dequeue it, and if
		 * it has a destination queue, schedule the insertion.
		 */
		if (!vm_page_pqstate_commit_dequeue(m, old, new))
			return (false);
		if (new.queue != PQ_NONE)
			vm_page_pqbatch_submit(m, new.queue);
	} else {
		/*
		 * The queue index is not changing away from a real queue:
		 * update the state, and submit the page only if a queue
		 * operation flag was newly set by this update.
		 */
		if (!vm_page_pqstate_fcmpset(m, old, new))
			return (false);
		if (new.queue != PQ_NONE &&
		    ((new.flags & ~old->flags) & PGA_QUEUE_OP_MASK) != 0)
			vm_page_pqbatch_submit(m, new.queue);
	}
	return (true);
}

/*
 * Apply deferred queue state updates to a page.
 *
 * The caller must be in a critical section and hold the lock of "pq",
 * which must be the queue with index "queue" in the page's domain.  The
 * loop retries until a commit succeeds or the page no longer has a
 * pending operation for this queue.
 */
static inline void
vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m, uint8_t queue)
{
	vm_page_astate_t new, old;

	CRITICAL_ASSERT(curthread);
	vm_pagequeue_assert_locked(pq);
	KASSERT(queue < PQ_COUNT,
	    ("%s: invalid queue index %d", __func__, queue));
	KASSERT(pq == _vm_page_pagequeue(m, queue),
	    ("%s: page %p does not belong to queue %p", __func__, m, pq));

	for (old = vm_page_astate_load(m);;) {
		/*
		 * The page moved to another queue or has no operation
		 * pending; count it as a no-op and stop.
		 */
		if (__predict_false(old.queue != queue ||
		    (old.flags & PGA_QUEUE_OP_MASK) == 0)) {
			counter_u64_add(queue_nops, 1);
			break;
		}
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("%s: page %p is unmanaged", __func__, m));

		new = old;
		/* A pending dequeue is handled in preference to a requeue. */
		if ((old.flags & PGA_DEQUEUE) != 0) {
			new.flags &= ~PGA_QUEUE_OP_MASK;
			new.queue = PQ_NONE;
			if (__predict_true(_vm_page_pqstate_commit_dequeue(pq,
			    m, &old, new))) {
				counter_u64_add(queue_ops, 1);
				break;
			}
		} else {
			new.flags &= ~(PGA_REQUEUE | PGA_REQUEUE_HEAD);
			if (__predict_true(_vm_page_pqstate_commit_requeue(pq,
			    m, &old, new))) {
				counter_u64_add(queue_ops, 1);
				break;
			}
		}
	}
}

/*
 * Process every page currently held in the batch queue "bq" against the
 * page queue "pq", then reinitialize the batch queue.
 */
static void
vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq,
    uint8_t queue)
{
	int i;

	for (i = 0; i < bq->bq_cnt; i++)
		vm_pqbatch_process_page(pq, bq->bq_pa[i], queue);
	vm_batchqueue_init(bq);
}

/*
 *	vm_page_pqbatch_submit:		[ internal use only ]
 *
 *	Enqueue a page in the specified page queue's batched work queue.
* The caller must have encoded the requested operation in the page
 *	structure's a.flags field.
 */
void
vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
{
	struct vm_batchqueue *bq;
	struct vm_pagequeue *pq;
	int domain;

	KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue));

	/*
	 * Fast path: append the page to this CPU's batch queue without
	 * taking the page queue lock.
	 */
	domain = vm_page_domain(m);
	critical_enter();
	bq = DPCPU_PTR(pqbatch[domain][queue]);
	if (vm_batchqueue_insert(bq, m)) {
		critical_exit();
		return;
	}
	critical_exit();

	/*
	 * Slow path: the insertion failed.  Take the page queue lock, then
	 * look up the per-CPU batch queue again (the critical section was
	 * left in between, so presumably the thread may now be running on
	 * a different CPU), drain it, and finally process this page
	 * directly.
	 */
	pq = &VM_DOMAIN(domain)->vmd_pagequeues[queue];
	vm_pagequeue_lock(pq);
	critical_enter();
	bq = DPCPU_PTR(pqbatch[domain][queue]);
	vm_pqbatch_process(pq, bq, queue);
	vm_pqbatch_process_page(pq, m, queue);
	vm_pagequeue_unlock(pq);
	critical_exit();
}

/*
 *	vm_page_pqbatch_drain:		[ internal use only ]
 *
 *	Force all per-CPU page queue batch queues to be drained.  This is
 *	intended for use in severe memory shortages, to ensure that pages
 *	do not remain stuck in the batch queues.
 */
void
vm_page_pqbatch_drain(void)
{
	struct thread *td;
	struct vm_domain *vmd;
	struct vm_pagequeue *pq;
	int cpu, domain, queue;

	td = curthread;
	CPU_FOREACH(cpu) {
		/*
		 * Bind the current thread to each CPU in turn so that the
		 * DPCPU_PTR() lookups below refer to that CPU's batch queues.
		 */
		thread_lock(td);
		sched_bind(td, cpu);
		thread_unlock(td);

		for (domain = 0; domain < vm_ndomains; domain++) {
			vmd = VM_DOMAIN(domain);
			for (queue = 0; queue < PQ_COUNT; queue++) {
				pq = &vmd->vmd_pagequeues[queue];
				vm_pagequeue_lock(pq);
				critical_enter();
				vm_pqbatch_process(pq,
				    DPCPU_PTR(pqbatch[domain][queue]), queue);
				critical_exit();
				vm_pagequeue_unlock(pq);
			}
		}
	}
	/* Release the CPU binding taken in the loop above. */
	thread_lock(td);
	sched_unbind(td);
	thread_unlock(td);
}

/*
 *	vm_page_dequeue_deferred:	[ internal use only ]
 *
 *	Request removal of the given page from its current page
 *	queue.  Physical removal from the queue may be deferred
 *	indefinitely.
*/
void
vm_page_dequeue_deferred(vm_page_t m)
{
	vm_page_astate_t new, old;

	old = vm_page_astate_load(m);
	do {
		/* A page that is in no queue needs no dequeue request. */
		if (old.queue == PQ_NONE) {
			KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
			    ("%s: page %p has unexpected queue state",
			    __func__, m));
			break;
		}
		new = old;
		new.flags |= PGA_DEQUEUE;
	} while (!vm_page_pqstate_commit_request(m, &old, new));
}

/*
 *	vm_page_dequeue:
 *
 *	Remove the page from whichever page queue it's in, if any, before
 *	returning.
 */
void
vm_page_dequeue(vm_page_t m)
{
	vm_page_astate_t new, old;

	old = vm_page_astate_load(m);
	do {
		if (old.queue == PQ_NONE) {
			KASSERT((old.flags & PGA_QUEUE_STATE_MASK) == 0,
			    ("%s: page %p has unexpected queue state",
			    __func__, m));
			break;
		}
		/* Drop any pending queue operation along with the queue. */
		new = old;
		new.flags &= ~PGA_QUEUE_OP_MASK;
		new.queue = PQ_NONE;
	} while (!vm_page_pqstate_commit_dequeue(m, &old, new));
}

/*
 * Schedule the given page for insertion into the specified page queue.
 * Physical insertion of the page may be deferred indefinitely.
 *
 * The page must not currently be in a queue and must carry at least one
 * reference.
 */
static void
vm_page_enqueue(vm_page_t m, uint8_t queue)
{

	KASSERT(m->a.queue == PQ_NONE &&
	    (m->a.flags & PGA_QUEUE_STATE_MASK) == 0,
	    ("%s: page %p is already enqueued", __func__, m));
	KASSERT(m->ref_count > 0,
	    ("%s: page %p does not carry any references", __func__, m));

	m->a.queue = queue;
	if ((m->a.flags & PGA_REQUEUE) == 0)
		vm_page_aflag_set(m, PGA_REQUEUE);
	vm_page_pqbatch_submit(m, queue);
}

/*
 *	vm_page_free_prep:
 *
 *	Prepares the given page to be put on the free list,
 *	disassociating it from any VM object.  The caller may return
 *	the page to the free list only if this function returns true.
 *
 *	The object, if it exists, must be locked, and then the page must
 *	be xbusy.  Otherwise the page must be not busied.  A managed
 *	page must be unmapped.
 *
 *	False is returned for fictitious pages and, when reservations
 *	are compiled in, for pages that vm_reserv_free_page() accepted.
 */
static bool
vm_page_free_prep(vm_page_t m)
{

	/*
	 * Synchronize with threads that have dropped a reference to this
	 * page.
	 */
	atomic_thread_fence_acq();

#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
	/* Verify that a page marked PG_ZERO really contains only zeroes. */
	if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) {
		uint64_t *p;
		int i;
		p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
		for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
			KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
			    m, i, (uintmax_t)*p));
	}
#endif
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		KASSERT(!pmap_page_is_mapped(m),
		    ("vm_page_free_prep: freeing mapped page %p", m));
		KASSERT((m->a.flags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0,
		    ("vm_page_free_prep: mapping flags set in page %p", m));
	} else {
		KASSERT(m->a.queue == PQ_NONE,
		    ("vm_page_free_prep: unmanaged page %p is queued", m));
	}
	VM_CNT_INC(v_tfree);

	if (m->object != NULL) {
		KASSERT(((m->oflags & VPO_UNMANAGED) != 0) ==
		    ((m->object->flags & OBJ_UNMANAGED) != 0),
		    ("vm_page_free_prep: managed flag mismatch for page %p",
		    m));
		vm_page_assert_xbusied(m);

		/*
		 * The object reference can be released without an atomic
		 * operation.
		 */
		KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
		    m->ref_count == VPRC_OBJREF,
		    ("vm_page_free_prep: page %p has unexpected ref_count %u",
		    m, m->ref_count));
		vm_page_object_remove(m);
		m->ref_count -= VPRC_OBJREF;
	} else
		vm_page_assert_unbusied(m);

	vm_page_busy_free(m);

	/*
	 * If fictitious remove object association and
	 * return.
	 */
	if ((m->flags & PG_FICTITIOUS) != 0) {
		KASSERT(m->ref_count == 1,
		    ("fictitious page %p is referenced", m));
		KASSERT(m->a.queue == PQ_NONE,
		    ("fictitious page %p is queued", m));
		return (false);
	}

	/*
	 * Pages need not be dequeued before they are returned to the physical
	 * memory allocator, but they must at least be marked for a deferred
	 * dequeue.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0)
		vm_page_dequeue_deferred(m);

	m->valid = 0;
	vm_page_undirty(m);

	if (m->ref_count != 0)
		panic("vm_page_free_prep: page %p has references", m);

	/*
	 * Restore the default memory attribute to the page.
	 */
	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);

#if VM_NRESERVLEVEL > 0
	/*
	 * Determine whether the page belongs to a reservation.  If the page was
	 * allocated from a per-CPU cache, it cannot belong to a reservation, so
	 * as an optimization, we avoid the check in that case.
	 */
	if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m))
		return (false);
#endif

	return (true);
}

/*
 *	vm_page_free_toq:
 *
 *	Returns the given page to the free list, disassociating it
 *	from any VM object.
 *
 *	The object must be locked.  The page must be exclusively busied if it
 *	belongs to an object.
 */
static void
vm_page_free_toq(vm_page_t m)
{
	struct vm_domain *vmd;
	uma_zone_t zone;

	/* Nothing more to do if the page must not go to the free list. */
	if (!vm_page_free_prep(m))
		return;

	/*
	 * A page that came from a per-CPU cache is returned to the cache
	 * zone of its pool, if that zone exists.
	 */
	vmd = vm_pagequeue_domain(m);
	zone = vmd->vmd_pgcache[m->pool].zone;
	if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) {
		uma_zfree(zone, m);
		return;
	}
	/* Otherwise hand the page to the physical allocator. */
	vm_domain_free_lock(vmd);
	vm_phys_free_pages(m, 0);
	vm_domain_free_unlock(vmd);
	vm_domain_freecnt_inc(vmd, 1);
}

/*
 *	vm_page_free_pages_toq:
 *
 *	Returns a list of pages to the free list, disassociating each
 *	from any VM object.  In other words, this is equivalent to
 *	calling vm_page_free_toq() for each page on the list.  The list
 *	is emptied in the process.  If update_wire_count is true, the
 *	number of pages processed is passed to vm_wire_sub().
 */
void
vm_page_free_pages_toq(struct spglist *free, bool update_wire_count)
{
	vm_page_t m;
	int count;

	if (SLIST_EMPTY(free))
		return;

	count = 0;
	while ((m = SLIST_FIRST(free)) != NULL) {
		count++;
		SLIST_REMOVE_HEAD(free, plinks.s.ss);
		vm_page_free_toq(m);
	}

	if (update_wire_count)
		vm_wire_sub(count);
}

/*
 * Mark this page as wired down.  For managed pages, this prevents reclamation
 * by the page daemon, or when the containing object, if any, is destroyed.
*/
void
vm_page_wire(vm_page_t m)
{
	u_int old;

#ifdef INVARIANTS
	/*
	 * If neither the page nor its object is busied, the object lock
	 * must be held.
	 */
	if (m->object != NULL && !vm_page_busied(m) &&
	    !vm_object_busied(m->object))
		VM_OBJECT_ASSERT_LOCKED(m->object);
#endif
	KASSERT((m->flags & PG_FICTITIOUS) == 0 ||
	    VPRC_WIRE_COUNT(m->ref_count) >= 1,
	    ("vm_page_wire: fictitious page %p has zero wirings", m));

	old = atomic_fetchadd_int(&m->ref_count, 1);
	KASSERT(VPRC_WIRE_COUNT(old) != VPRC_WIRE_COUNT_MAX,
	    ("vm_page_wire: counter overflow for page %p", m));
	/*
	 * On the first wiring, request a dequeue of a managed page and
	 * account for the newly wired page.
	 */
	if (VPRC_WIRE_COUNT(old) == 0) {
		if ((m->oflags & VPO_UNMANAGED) == 0)
			vm_page_aflag_set(m, PGA_DEQUEUE);
		vm_wire_add(1);
	}
}

/*
 * Attempt to wire a mapped page following a pmap lookup of that page.
 * This may fail if a thread is concurrently tearing down mappings of the page.
 * The transient failure is acceptable because it translates to the
 * failure of the caller pmap_extract_and_hold(), which should be then
 * followed by the vm_fault() fallback, see e.g. vm_fault_quick_hold_pages().
 */
bool
vm_page_wire_mapped(vm_page_t m)
{
	u_int old;

	old = m->ref_count;
	do {
		KASSERT(old > 0,
		    ("vm_page_wire_mapped: wiring unreferenced page %p", m));
		/* Refuse to wire while VPRC_BLOCKED is set. */
		if ((old & VPRC_BLOCKED) != 0)
			return (false);
	} while (!atomic_fcmpset_int(&m->ref_count, &old, old + 1));

	if (VPRC_WIRE_COUNT(old) == 0) {
		if ((m->oflags & VPO_UNMANAGED) == 0)
			vm_page_aflag_set(m, PGA_DEQUEUE);
		vm_wire_add(1);
	}
	return (true);
}

/*
 * Release a wiring reference to a managed page.  If the page still belongs to
 * an object, update its position in the page queues to reflect the reference.
 * If the wiring was the last reference to the page, free the page.
 */
static void
vm_page_unwire_managed(vm_page_t m, uint8_t nqueue, bool noreuse)
{
	u_int old;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("%s: page %p is unmanaged", __func__, m));

	/*
	 * Update LRU state before releasing the wiring reference.
	 * Use a release store when updating the reference count to
	 * synchronize with vm_page_free_prep().
	 */
	old = m->ref_count;
	do {
		KASSERT(VPRC_WIRE_COUNT(old) > 0,
		    ("vm_page_unwire: wire count underflow for page %p", m));

		if (old > VPRC_OBJREF + 1) {
			/*
			 * The page has at least one other wiring reference.  An
			 * earlier iteration of this loop may have called
			 * vm_page_release_toq() and cleared PGA_DEQUEUE, so
			 * re-set it if necessary.
			 */
			if ((vm_page_astate_load(m).flags & PGA_DEQUEUE) == 0)
				vm_page_aflag_set(m, PGA_DEQUEUE);
		} else if (old == VPRC_OBJREF + 1) {
			/*
			 * This is the last wiring.  Clear PGA_DEQUEUE and
			 * update the page's queue state to reflect the
			 * reference.  If the page does not belong to an object
			 * (i.e., the VPRC_OBJREF bit is clear), we only need to
			 * clear leftover queue state.
			 */
			vm_page_release_toq(m, nqueue, noreuse);
		} else if (old == 1) {
			vm_page_aflag_clear(m, PGA_DEQUEUE);
		}
	} while (!atomic_fcmpset_rel_int(&m->ref_count, &old, old - 1));

	/*
	 * If the wire count dropped to zero, update the wired page
	 * accounting, and free the page if that was its only reference.
	 */
	if (VPRC_WIRE_COUNT(old) == 1) {
		vm_wire_sub(1);
		if (old == 1)
			vm_page_free(m);
	}
}

/*
 * Release one wiring of the specified page, potentially allowing it to be
 * paged out.
 *
 * Only managed pages belonging to an object can be paged out.  If the number
 * of wirings transitions to zero and the page is eligible for page out, then
 * the page is added to the specified paging queue.  If the released wiring
 * represented the last reference to the page, the page is freed.
 */
void
vm_page_unwire(vm_page_t m, uint8_t nqueue)
{

	KASSERT(nqueue < PQ_COUNT,
	    ("vm_page_unwire: invalid queue %u request for page %p",
	    nqueue, m));

	/* Unmanaged pages are never queued; just unwire and maybe free. */
	if ((m->oflags & VPO_UNMANAGED) != 0) {
		if (vm_page_unwire_noq(m) && m->ref_count == 0)
			vm_page_free(m);
		return;
	}
	vm_page_unwire_managed(m, nqueue, false);
}

/*
 * Unwire a page without (re-)inserting it into a page queue.  It is up
 * to the caller to enqueue, requeue, or free the page as appropriate.
 * In most cases involving managed pages, vm_page_unwire() should be used
 * instead.
*/
bool
vm_page_unwire_noq(vm_page_t m)
{
	u_int old;

	old = vm_page_drop(m, 1);
	KASSERT(VPRC_WIRE_COUNT(old) != 0,
	    ("%s: counter underflow for page %p", __func__, m));
	KASSERT((m->flags & PG_FICTITIOUS) == 0 || VPRC_WIRE_COUNT(old) > 1,
	    ("%s: missing ref on fictitious page %p", __func__, m));

	/* Other wirings remain; report that this was not the last one. */
	if (VPRC_WIRE_COUNT(old) > 1)
		return (false);
	if ((m->oflags & VPO_UNMANAGED) == 0)
		vm_page_aflag_clear(m, PGA_DEQUEUE);
	vm_wire_sub(1);
	return (true);
}

/*
 * Ensure that the page ends up in the specified page queue.  If the page is
 * active or being moved to the active queue, ensure that its act_count is
 * at least ACT_INIT but do not otherwise mess with it.
 *
 * Unmanaged and wired pages are left alone, as are pages that already
 * have PGA_DEQUEUE set.
 */
static __always_inline void
vm_page_mvqueue(vm_page_t m, const uint8_t nqueue, const uint16_t nflag)
{
	vm_page_astate_t old, new;

	KASSERT(m->ref_count > 0,
	    ("%s: page %p does not carry any references", __func__, m));
	KASSERT(nflag == PGA_REQUEUE || nflag == PGA_REQUEUE_HEAD,
	    ("%s: invalid flags %x", __func__, nflag));

	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_wired(m))
		return;

	old = vm_page_astate_load(m);
	do {
		if ((old.flags & PGA_DEQUEUE) != 0)
			break;
		new = old;
		new.flags &= ~PGA_QUEUE_OP_MASK;
		if (nqueue == PQ_ACTIVE)
			new.act_count = max(old.act_count, ACT_INIT);
		if (old.queue == nqueue) {
			/*
			 * The page is already in the target queue; request a
			 * requeue unless that queue is the active queue.
			 */
			if (nqueue != PQ_ACTIVE)
				new.flags |= nflag;
		} else {
			new.flags |= nflag;
			new.queue = nqueue;
		}
	} while (!vm_page_pqstate_commit(m, &old, new));
}

/*
 * Put the specified page on the active list (if appropriate).
 */
void
vm_page_activate(vm_page_t m)
{

	vm_page_mvqueue(m, PQ_ACTIVE, PGA_REQUEUE);
}

/*
 * Move the specified page to the tail of the inactive queue, or requeue
 * the page if it is already in the inactive queue.
 */
void
vm_page_deactivate(vm_page_t m)
{

	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE);
}

/*
 * Like vm_page_deactivate(), but request the move with PGA_REQUEUE_HEAD
 * instead of PGA_REQUEUE.
 */
void
vm_page_deactivate_noreuse(vm_page_t m)
{

	vm_page_mvqueue(m, PQ_INACTIVE, PGA_REQUEUE_HEAD);
}

/*
 * Put a page in the laundry, or requeue it if it is already there.
*/
void
vm_page_launder(vm_page_t m)
{

	vm_page_mvqueue(m, PQ_LAUNDRY, PGA_REQUEUE);
}

/*
 * Put a page in the PQ_UNSWAPPABLE holding queue.
 *
 * The page must be managed and not wired.
 */
void
vm_page_unswappable(vm_page_t m)
{

	KASSERT(!vm_page_wired(m) && (m->oflags & VPO_UNMANAGED) == 0,
	    ("page %p already unswappable", m));

	vm_page_dequeue(m);
	vm_page_enqueue(m, PQ_UNSWAPPABLE);
}

/*
 * Release a page back to the page queues in preparation for unwiring.
 */
static void
vm_page_release_toq(vm_page_t m, uint8_t nqueue, const bool noreuse)
{
	vm_page_astate_t old, new;
	uint16_t nflag;

	/*
	 * Use a check of the valid bits to determine whether we should
	 * accelerate reclamation of the page.  The object lock might not be
	 * held here, in which case the check is racy.  At worst we will either
	 * accelerate reclamation of a valid page and violate LRU, or
	 * unnecessarily defer reclamation of an invalid page.
	 *
	 * If we were asked to not cache the page, place it near the head of the
	 * inactive queue so that is reclaimed sooner.
	 */
	if (noreuse || m->valid == 0) {
		nqueue = PQ_INACTIVE;
		nflag = PGA_REQUEUE_HEAD;
	} else {
		nflag = PGA_REQUEUE;
	}

	old = vm_page_astate_load(m);
	do {
		new = old;

		/*
		 * If the page is already in the active queue and we are not
		 * trying to accelerate reclamation, simply mark it as
		 * referenced and avoid any queue operations.
		 */
		new.flags &= ~PGA_QUEUE_OP_MASK;
		if (nflag != PGA_REQUEUE_HEAD && old.queue == PQ_ACTIVE)
			new.flags |= PGA_REFERENCED;
		else {
			new.flags |= nflag;
			new.queue = nqueue;
		}
	} while (!vm_page_pqstate_commit(m, &old, new));
}

/*
 * Unwire a page and either attempt to free it or re-add it to the page queues.
 *
 * With VPR_TRYFREE, an attempt is first made to take the object's write
 * lock without sleeping so that vm_page_release_locked() can be used;
 * if the page has no object, is busied, or the lock cannot be taken,
 * the page is unwired via vm_page_unwire_managed() instead.
 */
void
vm_page_release(vm_page_t m, int flags)
{
	vm_object_t object;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("vm_page_release: page %p is unmanaged", m));

	if ((flags & VPR_TRYFREE) != 0) {
		for (;;) {
			object = atomic_load_ptr(&m->object);
			if (object == NULL)
				break;
			/* Depends on type-stability. */
			if (vm_page_busied(m) || !VM_OBJECT_TRYWLOCK(object))
				break;
			/*
			 * Re-check the page's object now that the lock is
			 * held; if it changed, unlock and try again.
			 */
			if (object == m->object) {
				vm_page_release_locked(m, flags);
				VM_OBJECT_WUNLOCK(object);
				return;
			}
			VM_OBJECT_WUNLOCK(object);
		}
	}
	vm_page_unwire_managed(m, PQ_INACTIVE, flags != 0);
}

/* See vm_page_release(). */
void
vm_page_release_locked(vm_page_t m, int flags)
{

	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("vm_page_release_locked: page %p is unmanaged", m));

	/* Queue or free the page only if this was the last wiring. */
	if (vm_page_unwire_noq(m)) {
		if ((flags & VPR_TRYFREE) != 0 &&
		    (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) &&
		    m->dirty == 0 && vm_page_tryxbusy(m)) {
			/*
			 * An unlocked lookup may have wired the page before the
			 * busy lock was acquired, in which case the page must
			 * not be freed.
			 */
			if (__predict_true(!vm_page_wired(m))) {
				vm_page_free(m);
				return;
			}
			vm_page_xunbusy(m);
		} else {
			vm_page_release_toq(m, PQ_INACTIVE, flags != 0);
		}
	}
}

/*
 * Run "op" on the page with VPRC_BLOCKED set in its reference count.
 * The flag is set only if the page has no wirings; otherwise false is
 * returned and "op" is not called.  The flag is dropped again once "op"
 * has returned.  The page must be managed, busied and belong to a locked
 * object.
 */
static bool
vm_page_try_blocked_op(vm_page_t m, void (*op)(vm_page_t))
{
	u_int old;

	KASSERT(m->object != NULL && (m->oflags & VPO_UNMANAGED) == 0,
	    ("vm_page_try_blocked_op: page %p has no object", m));
	KASSERT(vm_page_busied(m),
	    ("vm_page_try_blocked_op: page %p is not busy", m));
	VM_OBJECT_ASSERT_LOCKED(m->object);

	old = m->ref_count;
	do {
		KASSERT(old != 0,
		    ("vm_page_try_blocked_op: page %p has no references", m));
		if (VPRC_WIRE_COUNT(old) != 0)
			return (false);
	} while (!atomic_fcmpset_int(&m->ref_count, &old, old | VPRC_BLOCKED));

	(op)(m);

	/*
	 * If the object is read-locked, new wirings may be created via an
	 * object lookup.
	 */
	old = vm_page_drop(m, VPRC_BLOCKED);
	KASSERT(!VM_OBJECT_WOWNED(m->object) ||
	    old == (VPRC_BLOCKED | VPRC_OBJREF),
	    ("vm_page_try_blocked_op: unexpected refcount value %u for %p",
	    old, m));
	return (true);
}

/*
 * Atomically check for wirings and remove all mappings of the page.
*/ bool vm_page_try_remove_all(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_all)); } /* * Atomically check for wirings and remove all writeable mappings of the page. */ bool vm_page_try_remove_write(vm_page_t m) { return (vm_page_try_blocked_op(m, pmap_remove_write)); } /* * vm_page_advise * * Apply the specified advice to the given page. */ void vm_page_advise(vm_page_t m, int advice) { VM_OBJECT_ASSERT_WLOCKED(m->object); vm_page_assert_xbusied(m); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else if (!vm_page_in_laundry(m)) vm_page_launder(m); } /* * vm_page_grab_release * * Helper routine for grab functions to release busy on return. */ static inline void vm_page_grab_release(vm_page_t m, int allocflags) { if ((allocflags & VM_ALLOC_NOBUSY) != 0) { if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) vm_page_sunbusy(m); else vm_page_xunbusy(m); } } /* * vm_page_grab_sleep * * Sleep for busy according to VM_ALLOC_ parameters. Returns true * if the caller should retry and false otherwise. 
* * If the object is locked on entry the object will be unlocked with * false returns and still locked but possibly having been dropped * with true returns. */ static bool vm_page_grab_sleep(vm_object_t object, vm_page_t m, vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (false); /* * Reference the page before unlocking and sleeping so that * the page daemon is less likely to reclaim it. */ if (locked && (allocflags & VM_ALLOC_NOCREAT) == 0) vm_page_reference(m); if (_vm_page_busy_sleep(object, m, pindex, wmesg, allocflags, locked) && locked) VM_OBJECT_WLOCK(object); if ((allocflags & VM_ALLOC_WAITFAIL) != 0) return (false); return (true); } /* * Assert that the grab flags are valid. */ static inline void vm_page_grab_check(int allocflags) { KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || (allocflags & VM_ALLOC_WIRED) != 0, ("vm_page_grab*: the pages must be busied or wired")); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab*: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); } /* * Calculate the page allocation flags for grab. */ static inline int vm_page_grab_pflags(int allocflags) { int pflags; pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0) pflags |= VM_ALLOC_SBUSY; return (pflags); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. 
*/
vm_page_t
vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
	vm_page_t m;

	VM_OBJECT_ASSERT_WLOCKED(object);
	vm_page_grab_check(allocflags);

retrylookup:
	if ((m = vm_page_lookup(object, pindex)) != NULL) {
		/*
		 * The page exists.  If it cannot be acquired right away,
		 * sleep and look it up again, or give up if the flags do
		 * not permit a retry.
		 */
		if (!vm_page_tryacquire(m, allocflags)) {
			if (vm_page_grab_sleep(object, m, pindex, "pgrbwt",
			    allocflags, true))
				goto retrylookup;
			return (NULL);
		}
		goto out;
	}
	/* The page does not exist and the caller forbids creating it. */
	if ((allocflags & VM_ALLOC_NOCREAT) != 0)
		return (NULL);
	m = vm_page_alloc(object, pindex, vm_page_grab_pflags(allocflags));
	if (m == NULL) {
		/* Allocation failed: fail or retry, as the flags dictate. */
		if ((allocflags & (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL)) != 0)
			return (NULL);
		goto retrylookup;
	}
	/* Zero a freshly allocated page on request if it is not already. */
	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

out:
	vm_page_grab_release(m, allocflags);

	return (m);
}

/*
 * Locklessly attempt to acquire a page given a (object, pindex) tuple
 * and an optional previous page to avoid the radix lookup.  The resulting
 * page will be validated against the identity tuple and busied or wired
 * as requested.  A NULL *mp return guarantees that the page was not in
 * radix at the time of the call but callers must perform higher level
 * synchronization or retry the operation under a lock if they require
 * an atomic answer.  This is the only lock free validation routine,
 * other routines can depend on the resulting page state.
 *
 * The return value indicates whether the operation failed due to caller
 * flags.  The return is tri-state with mp:
 *
 * (true, *mp != NULL) - The operation was successful.
 * (true, *mp == NULL) - The page was not found in tree.
 * (false, *mp == NULL) - WAITFAIL or NOWAIT prevented acquisition.
*/
static bool
vm_page_acquire_unlocked(vm_object_t object, vm_pindex_t pindex,
    vm_page_t prev, vm_page_t *mp, int allocflags)
{
	vm_page_t m;

	vm_page_grab_check(allocflags);
	MPASS(prev == NULL || vm_page_busied(prev) || vm_page_wired(prev));

	*mp = NULL;
	for (;;) {
		/*
		 * We may see a false NULL here because the previous page
		 * has been removed or just inserted and the list is loaded
		 * without barriers.  Switch to radix to verify.
		 */
		if (prev == NULL || (m = TAILQ_NEXT(prev, listq)) == NULL ||
		    QMD_IS_TRASHED(m) || m->pindex != pindex ||
		    atomic_load_ptr(&m->object) != object) {
			prev = NULL;
			/*
			 * This guarantees the result is instantaneously
			 * correct.
			 */
			m = vm_radix_lookup_unlocked(&object->rtree, pindex);
		}
		if (m == NULL)
			return (true);
		if (vm_page_trybusy(m, allocflags)) {
			/*
			 * The page is busied; confirm that it still has the
			 * requested identity before using it.
			 */
			if (m->object == object && m->pindex == pindex)
				break;
			/* relookup. */
			vm_page_busy_release(m);
			cpu_spinwait();
			continue;
		}
		if (!vm_page_grab_sleep(object, m, pindex, "pgnslp",
		    allocflags, false))
			return (false);
	}
	if ((allocflags & VM_ALLOC_WIRED) != 0)
		vm_page_wire(m);
	vm_page_grab_release(m, allocflags);
	*mp = m;
	return (true);
}

/*
 * Try to locklessly grab a page and fall back to the object lock if NOCREAT
 * is not set.
 */
vm_page_t
vm_page_grab_unlocked(vm_object_t object, vm_pindex_t pindex, int allocflags)
{
	vm_page_t m;

	vm_page_grab_check(allocflags);

	if (!vm_page_acquire_unlocked(object, pindex, NULL, &m, allocflags))
		return (NULL);
	if (m != NULL)
		return (m);

	/*
	 * The radix lockless lookup should never return false negatives.
	 * If the user specifies NOCREAT they are guaranteed there
	 * was no page present at the instant of the call.  A NOCREAT caller
	 * must handle create races gracefully.
	 */
	if ((allocflags & VM_ALLOC_NOCREAT) != 0)
		return (NULL);

	VM_OBJECT_WLOCK(object);
	m = vm_page_grab(object, pindex, allocflags);
	VM_OBJECT_WUNLOCK(object);

	return (m);
}

/*
 * Grab a page and make it valid, paging in if necessary.  Pages missing from
 * their pager are zero filled and validated.
If a VM_ALLOC_COUNT is supplied
 * and the page is not valid as many as VM_INITIAL_PAGEIN pages can be brought
 * in simultaneously.  Additional pages will be left on a paging queue but
 * will neither be wired nor busy regardless of allocflags.
 */
int
vm_page_grab_valid(vm_page_t *mp, vm_object_t object, vm_pindex_t pindex,
    int allocflags)
{
	vm_page_t m;
	vm_page_t ma[VM_INITIAL_PAGEIN];
	int after, i, pflags, rv;

	/* SBUSY is only accepted together with IGN_SBUSY. */
	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
	    ("vm_page_grab_valid: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
	KASSERT((allocflags &
	    (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
	    ("vm_page_grab_valid: Invalid flags 0x%X", allocflags));
	VM_OBJECT_ASSERT_WLOCKED(object);
	/*
	 * pflags is what gets handed to vm_page_alloc(): the caller's
	 * busy/wire disposition bits are stripped and VM_ALLOC_WAITFAIL is
	 * added.  A NULL allocation below simply restarts the lookup.
	 */
	pflags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY |
	    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY);
	pflags |= VM_ALLOC_WAITFAIL;

retrylookup:
	if ((m = vm_page_lookup(object, pindex)) != NULL) {
		/*
		 * If the page is fully valid it can only become invalid
		 * with the object lock held.  If it is not valid it can
		 * become valid with the busy lock held.  Therefore, we
		 * may unnecessarily lock the exclusive busy here if we
		 * race with I/O completion not using the object lock.
		 * However, we will not end up with an invalid page and a
		 * shared lock.
		 */
		if (!vm_page_trybusy(m,
		    vm_page_all_valid(m) ? allocflags : 0)) {
			/* The sleep result is ignored; always look up again. */
			(void)vm_page_grab_sleep(object, m, pindex, "pgrbwt",
			    allocflags, true);
			goto retrylookup;
		}
		if (vm_page_all_valid(m))
			goto out;
		/* Resident but not valid, and the caller forbids page-in. */
		if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
			vm_page_busy_release(m);
			*mp = NULL;
			return (VM_PAGER_FAIL);
		}
	} else if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
		*mp = NULL;
		return (VM_PAGER_FAIL);
	} else if ((m = vm_page_alloc(object, pindex, pflags)) == NULL) {
		goto retrylookup;
	}

	vm_page_assert_xbusied(m);
	if (vm_pager_has_page(object, pindex, NULL, &after)) {
		/*
		 * Clamp the run length to the ma[] capacity and to the count
		 * encoded in allocflags, but always read at least one page.
		 */
		after = MIN(after, VM_INITIAL_PAGEIN);
		after = MIN(after, allocflags >> VM_ALLOC_COUNT_SHIFT);
		after = MAX(after, 1);
		ma[0] = m;
		/*
		 * Extend the run with following pages: stop at a resident
		 * page that has any valid bits or cannot be xbusied, or when
		 * a new page cannot be allocated.
		 */
		for (i = 1; i < after; i++) {
			if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
				if (ma[i]->valid || !vm_page_tryxbusy(ma[i]))
					break;
			} else {
				ma[i] = vm_page_alloc(object, m->pindex + i,
				    VM_ALLOC_NORMAL);
				if (ma[i] == NULL)
					break;
			}
		}
		after = i;
		/* The object lock is dropped across the pager call. */
		vm_object_pip_add(object, after);
		VM_OBJECT_WUNLOCK(object);
		rv = vm_pager_get_pages(object, ma, after, NULL, NULL);
		VM_OBJECT_WLOCK(object);
		vm_object_pip_wakeupn(object, after);
		/* Pager may have replaced a page. */
		m = ma[0];
		if (rv != VM_PAGER_OK) {
			/* On failure free unwired pages, unbusy wired ones. */
			for (i = 0; i < after; i++) {
				if (!vm_page_wired(ma[i]))
					vm_page_free(ma[i]);
				else
					vm_page_xunbusy(ma[i]);
			}
			*mp = NULL;
			return (rv);
		}
		/* Only ma[0] is returned; the rest are handed off here. */
		for (i = 1; i < after; i++)
			vm_page_readahead_finish(ma[i]);
		MPASS(vm_page_all_valid(m));
	} else {
		/* The pager has no copy: zero the invalid parts, set valid. */
		vm_page_zero_invalid(m, TRUE);
	}
out:
	/* Apply the wire/busy disposition the caller asked for. */
	if ((allocflags & VM_ALLOC_WIRED) != 0)
		vm_page_wire(m);
	if ((allocflags & VM_ALLOC_SBUSY) != 0 && vm_page_xbusied(m))
		vm_page_busy_downgrade(m);
	else if ((allocflags & VM_ALLOC_NOBUSY) != 0)
		vm_page_busy_release(m);
	*mp = m;
	return (VM_PAGER_OK);
}

/*
 * Locklessly grab a valid page.  If the page is not valid or not yet
 * allocated this will fall back to the object lock method.
*/
int
vm_page_grab_valid_unlocked(vm_page_t *mp, vm_object_t object,
    vm_pindex_t pindex, int allocflags)
{
	vm_page_t m;
	int lockless_flags, rv;

	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
	    ("vm_page_grab_valid_unlocked: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY "
	    "mismatch"));
	KASSERT((allocflags &
	    (VM_ALLOC_NOWAIT | VM_ALLOC_WAITFAIL | VM_ALLOC_ZERO)) == 0,
	    ("vm_page_grab_valid_unlocked: Invalid flags 0x%X", allocflags));

	/*
	 * Attempt a lockless lookup and busy.  We need at least an sbusy
	 * before we can inspect the valid field and return a wired page,
	 * so NOBUSY and WIRED are withheld from the lockless attempt and
	 * applied by hand below.
	 */
	lockless_flags = allocflags & ~(VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
	if (!vm_page_acquire_unlocked(object, pindex, NULL, mp,
	    lockless_flags))
		return (VM_PAGER_FAIL);

	m = *mp;
	if (m != NULL) {
		if (!vm_page_all_valid(m)) {
			/* Not fully valid: let go and take the locked path. */
			vm_page_busy_release(m);
		} else {
			/* Fast path: finish with the caller's real flags. */
			if ((allocflags & VM_ALLOC_WIRED) != 0)
				vm_page_wire(m);
			vm_page_grab_release(m, allocflags);
			return (VM_PAGER_OK);
		}
	}

	if ((allocflags & VM_ALLOC_NOCREAT) != 0) {
		*mp = NULL;
		return (VM_PAGER_FAIL);
	}

	/* Fall back to the locked variant under the object write lock. */
	VM_OBJECT_WLOCK(object);
	rv = vm_page_grab_valid(mp, object, pindex, allocflags);
	VM_OBJECT_WUNLOCK(object);

	return (rv);
}

/*
 * Return the specified range of pages from the given object.  For each
 * page offset within the range, if a page already exists within the object
 * at that offset and it is busy, then wait for it to change state.  If,
 * instead, the page doesn't exist, then allocate it.
 *
 * The caller must always specify an allocation class.
 *
 * allocation classes:
 * VM_ALLOC_NORMAL normal process request
 * VM_ALLOC_SYSTEM system *really* needs the pages
 *
 * The caller must always specify that the pages are to be busied and/or
 * wired.
*
 * optional allocation flags:
 * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages
 * VM_ALLOC_NOBUSY do not exclusive busy the page
 * VM_ALLOC_NOWAIT do not sleep
 * VM_ALLOC_SBUSY set page to sbusy state
 * VM_ALLOC_WIRED wire the pages
 * VM_ALLOC_ZERO zero and validate any invalid pages
 *
 * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
 * may return a partial prefix of the requested range.
 *
 * The object must be write locked.  The return value is the number of
 * leading entries of ma[] that were filled in.
 */
int
vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
    vm_page_t *ma, int count)
{
	vm_page_t m, mpred;
	int pflags;
	int i;

	VM_OBJECT_ASSERT_WLOCKED(object);
	/* The count field of allocflags is filled in per allocation below. */
	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
	    ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
	KASSERT(count > 0,
	    ("vm_page_grab_pages: invalid page count %d", count));
	vm_page_grab_check(allocflags);

	pflags = vm_page_grab_pflags(allocflags);
	i = 0;
retrylookup:
	/*
	 * Find the page at pindex + i, or failing that its predecessor,
	 * which is handed to vm_page_alloc_after() as mpred.
	 */
	m = vm_radix_lookup_le(&object->rtree, pindex + i);
	if (m == NULL || m->pindex != pindex + i) {
		mpred = m;
		m = NULL;
	} else
		mpred = TAILQ_PREV(m, pglist, listq);
	for (; i < count; i++) {
		if (m != NULL) {
			if (!vm_page_tryacquire(m, allocflags)) {
				/* Redo the lookup if a retry is permitted. */
				if (vm_page_grab_sleep(object, m, pindex + i,
				    "grbmaw", allocflags, true))
					goto retrylookup;
				break;
			}
		} else {
			if ((allocflags & VM_ALLOC_NOCREAT) != 0)
				break;
			m = vm_page_alloc_after(object, pindex + i,
			    pflags | VM_ALLOC_COUNT(count - i), mpred);
			if (m == NULL) {
				if ((allocflags & (VM_ALLOC_NOWAIT |
				    VM_ALLOC_WAITFAIL)) != 0)
					break;
				goto retrylookup;
			}
		}
		/* With VM_ALLOC_ZERO a wholly invalid page is zeroed. */
		if (vm_page_none_valid(m) &&
		    (allocflags & VM_ALLOC_ZERO) != 0) {
			if ((m->flags & PG_ZERO) == 0)
				pmap_zero_page(m);
			vm_page_valid(m);
		}
		vm_page_grab_release(m, allocflags);
		/* The page just stored is the predecessor of the next one. */
		ma[i] = mpred = m;
		m = vm_page_next(m);
	}
	return (i);
}

/*
 * Unlocked variant of vm_page_grab_pages().  This accepts the same flags
 * and will fall back to the locked variant to handle allocation.
*/
int
vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
    int allocflags, vm_page_t *ma, int count)
{
	vm_page_t m, prev;
	int got, hold_flags, more;

	KASSERT(count > 0,
	    ("vm_page_grab_pages_unlocked: invalid page count %d", count));
	vm_page_grab_check(allocflags);

	/*
	 * Modify flags for lockless acquire to hold the page until we
	 * set it valid if necessary.
	 */
	hold_flags = allocflags & ~VM_ALLOC_NOBUSY;

	/* Collect as long a prefix as possible without the object lock. */
	prev = NULL;
	got = 0;
	while (got < count) {
		if (!vm_page_acquire_unlocked(object, pindex + got, prev, &m,
		    hold_flags))
			return (got);
		if (m == NULL)
			break;
		if ((hold_flags & VM_ALLOC_ZERO) != 0 &&
		    vm_page_none_valid(m)) {
			if ((m->flags & PG_ZERO) == 0)
				pmap_zero_page(m);
			vm_page_valid(m);
		}
		/* m will still be wired or busy according to flags. */
		vm_page_grab_release(m, allocflags);
		ma[got] = m;
		prev = m;
		got++;
	}

	/* Done if the range is complete or creation is not allowed. */
	if (got == count || (allocflags & VM_ALLOC_NOCREAT) != 0)
		return (got);

	/* Let the locked variant handle the remainder of the range. */
	VM_OBJECT_WLOCK(object);
	more = vm_page_grab_pages(object, pindex + got, allocflags, &ma[got],
	    count - got);
	VM_OBJECT_WUNLOCK(object);

	return (got + more);
}

/*
 * Mapping function for valid or dirty bits in a page.
 *
 * Inputs are required to range within a page.
*/
vm_page_bits_t
vm_page_bits(int base, int size)
{
	int first_bit;
	int last_bit;

	/*
	 * The result has one bit set for every DEV_BSIZE chunk from the one
	 * containing byte "base" through the one containing the last byte
	 * of the range, base + size - 1.
	 */
	KASSERT(
	    base + size <= PAGE_SIZE,
	    ("vm_page_bits: illegal base/size %d/%d", base, size)
	);

	if (size == 0)		/* handle degenerate case */
		return (0);

	first_bit = base >> DEV_BSHIFT;
	last_bit = (base + size - 1) >> DEV_BSHIFT;

	/* (2 << last) - (1 << first) sets bits first..last inclusive. */
	return (((vm_page_bits_t)2 << last_bit) -
	    ((vm_page_bits_t)1 << first_bit));
}

/*
 * Atomically OR "set" into *bits.  The atomic width is chosen from
 * PAGE_SIZE; the final branch operates on the aligned 32-bit word that
 * contains *bits.  "m" is not referenced by this function.
 */
void
vm_page_bits_set(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t set)
{

#if PAGE_SIZE == 32768
	atomic_set_64((uint64_t *)bits, set);
#elif PAGE_SIZE == 16384
	atomic_set_32((uint32_t *)bits, set);
#elif (PAGE_SIZE == 8192) && defined(atomic_set_16)
	atomic_set_16((uint16_t *)bits, set);
#elif (PAGE_SIZE == 4096) && defined(atomic_set_8)
	atomic_set_8((uint8_t *)bits, set);
#else		/* PAGE_SIZE <= 8192 */
	uintptr_t addr;
	int shift;

	addr = (uintptr_t)bits;
	/*
	 * Use a trick to perform a 32-bit atomic on the
	 * containing aligned word, to not depend on the existence
	 * of atomic_{set, clear}_{8, 16}.
	 */
	/* Byte offset of *bits within its aligned 32-bit word. */
	shift = addr & (sizeof(uint32_t) - 1);
#if BYTE_ORDER == BIG_ENDIAN
	/* Mirror the byte offset within the word, then convert to bits. */
	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
#else
	shift *= NBBY;
#endif
	addr &= ~(sizeof(uint32_t) - 1);
	atomic_set_32((uint32_t *)addr, set << shift);
#endif		/* PAGE_SIZE */
}

/*
 * Atomically clear the bits in "clear" from *bits.  Mirrors
 * vm_page_bits_set(), including the containing-word fallback.  "m" is not
 * referenced by this function.
 */
static inline void
vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t clear)
{

#if PAGE_SIZE == 32768
	atomic_clear_64((uint64_t *)bits, clear);
#elif PAGE_SIZE == 16384
	atomic_clear_32((uint32_t *)bits, clear);
#elif (PAGE_SIZE == 8192) && defined(atomic_clear_16)
	atomic_clear_16((uint16_t *)bits, clear);
#elif (PAGE_SIZE == 4096) && defined(atomic_clear_8)
	atomic_clear_8((uint8_t *)bits, clear);
#else		/* PAGE_SIZE <= 8192 */
	uintptr_t addr;
	int shift;

	addr = (uintptr_t)bits;
	/*
	 * Use a trick to perform a 32-bit atomic on the
	 * containing aligned word, to not depend on the existence
	 * of atomic_{set, clear}_{8, 16}.
	 */
	/* Byte offset of *bits within its aligned 32-bit word. */
	shift = addr & (sizeof(uint32_t) - 1);
#if BYTE_ORDER == BIG_ENDIAN
	/* Mirror the byte offset within the word, then convert to bits. */
	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
#else
	shift *= NBBY;
#endif
	addr &= ~(sizeof(uint32_t) - 1);
	atomic_clear_32((uint32_t *)addr, clear << shift);
#endif		/* PAGE_SIZE */
}

/*
 * Replace *bits with "newbits" using a compare-and-set loop and return the
 * value that was replaced.  "m" is not referenced by this function.
 */
static inline vm_page_bits_t
vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
{
#if PAGE_SIZE == 32768
	uint64_t old;

	old = *bits;
	/* Spin until the compare-and-set succeeds. */
	while (atomic_fcmpset_64(bits, &old, newbits) == 0);
	return (old);
#elif PAGE_SIZE == 16384
	uint32_t old;

	old = *bits;
	while (atomic_fcmpset_32(bits, &old, newbits) == 0);
	return (old);
#elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
	uint16_t old;

	old = *bits;
	while (atomic_fcmpset_16(bits, &old, newbits) == 0);
	return (old);
#elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
	uint8_t old;

	old = *bits;
	while (atomic_fcmpset_8(bits, &old, newbits) == 0);
	return (old);
#else		/* PAGE_SIZE <= 4096*/
	uintptr_t addr;
	uint32_t old, new, mask;
	int shift;

	addr = (uintptr_t)bits;
	/*
	 * Use a trick to perform a 32-bit atomic on the
	 * containing aligned word, to not depend on the existence
	 * of atomic_{set, swap, clear}_{8, 16}.
	 */
	shift = addr & (sizeof(uint32_t) - 1);
#if BYTE_ORDER == BIG_ENDIAN
	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
#else
	shift *= NBBY;
#endif
	addr &= ~(sizeof(uint32_t) - 1);
	/* mask covers the bits of the word that belong to *bits. */
	mask = VM_PAGE_BITS_ALL << shift;

	/*
	 * NOTE(review): "old" starts as the narrow field value rather than
	 * the containing word; presumably atomic_fcmpset_32() refreshes it
	 * with the current word when the comparison fails -- confirm.
	 */
	old = *bits;
	do {
		/* Keep the neighbouring bits, substitute only our field. */
		new = old & ~mask;
		new |= newbits << shift;
	} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
	return (old >> shift);
#endif		/* PAGE_SIZE */
}

/*
 * vm_page_set_valid_range:
 *
 * Sets portions of a page valid.  The arguments are expected
 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 * of any partial chunks touched by the range.  The invalid portion of
 * such chunks will be zeroed.
 *
 * (base + size) must be less then or equal to PAGE_SIZE.
*/ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; vm_page_bits_t pagebits; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); } /* * Set the page dirty bits and free the invalid swap space if * present. Returns the previous dirty bits. */ vm_page_bits_t vm_page_set_dirty(vm_page_t m) { vm_page_bits_t old; VM_PAGE_OBJECT_BUSY_ASSERT(m); if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) { old = m->dirty; m->dirty = VM_PAGE_BITS_ALL; } else old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL); if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0) vm_pager_page_unswapped(m); return (old); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { vm_page_assert_busied(m); /* * If the page is xbusied and not write mapped we are the * only thread that can modify dirty bits. 
Otherwise, The pmap * layer can call vm_page_dirty() without holding a distinguished * lock. The combination of page busy and atomic operations * suffice to guarantee consistency of the page dirty field. */ if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else vm_page_bits_clear(m, &m->dirty, pagebits); } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; vm_page_assert_busied(m); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the PGA_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. 
*/ oldvalid = m->valid; pagebits = vm_page_bits(base, size); if (vm_page_xbusied(m)) m->valid |= pagebits; else vm_page_bits_set(m, &m->valid, pagebits); #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; vm_page_aflag_clear(m, PGA_NOSYNC); } else if (oldvalid != VM_PAGE_BITS_ALL && vm_page_xbusied(m)) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; /* * The object lock is required so that pages can't be mapped * read-only while we're in the process of invalidating them. 
*/ object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); vm_page_assert_busied(m); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && vm_page_all_valid(m) && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && vm_page_all_valid(m)) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); if (vm_page_xbusied(m)) { m->valid &= ~bits; m->dirty &= ~bits; } else { vm_page_bits_clear(m, &m->valid, bits); vm_page_bits_clear(m, &m->dirty, bits); } } /* * vm_page_invalid: * * Invalidates the entire page. The page must be busy, unmapped, and * the enclosing object must be locked. The object locks protects * against concurrent read-only pmap enter which is done without * busy. */ void vm_page_invalid(vm_page_t m) { vm_page_assert_busied(m); VM_OBJECT_ASSERT_WLOCKED(m->object); MPASS(!pmap_page_is_mapped(m)); if (vm_page_xbusied(m)) m->valid = 0; else vm_page_bits_clear(m, &m->valid, VM_PAGE_BITS_ALL); } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). 
*/ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistency * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) vm_page_valid(m); } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. * * Some callers envoke this routine without the busy lock held and * handle races via higher level locks. Typical callers should * hold a busy lock to prevent invalidation. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; if (skip_m != NULL && skip_m->object != object) return (false); VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". 
However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { vm_page_assert_busied(m); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_valid(vm_page_t m) { vm_page_assert_busied(m); if (vm_page_xbusied(m)) m->valid = VM_PAGE_BITS_ALL; else vm_page_bits_set(m, &m->valid, VM_PAGE_BITS_ALL); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_busy_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of a page or object busy. */ if (m->object != NULL && !vm_page_busied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint16_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). 
*/ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_BUSY(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys, virt; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; virt = strchr(modif, 'v') != NULL; if (virt) m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); else if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->a.queue, m->ref_count, m->a.flags, m->oflags, m->flags, m->a.act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ diff --git a/sys/vm/vm_pageout.c 
b/sys/vm/vm_pageout.c index 74439d5884ef..bb12a7e335d5 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1,2423 +1,2419 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); /* Pagedaemon activity rates, in subdivisions of one second. 
*/ #define VM_LAUNDER_RATE 10 #define VM_INACT_SCAN_RATE 10 static int swapdev_enabled; int vm_pageout_page_count = 32; static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "Panic on the given number of out-of-memory errors instead of " "killing the largest process"); static int vm_pageout_update_period; SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RWTUN, &vm_pageout_update_period, 0, "Maximum active LRU update period"); static int pageout_cpus_per_thread = 16; SYSCTL_INT(_vm, OID_AUTO, pageout_cpus_per_thread, CTLFLAG_RDTUN, &pageout_cpus_per_thread, 0, "Number of CPUs per pagedaemon worker thread"); static int lowmem_period = 10; SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0, "Low memory callback period"); static int disable_swap_pageouts; SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); static int vm_pageout_oom_seq = 12; SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); static int act_scan_laundry_weight = 3; SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN, &act_scan_laundry_weight, 0, "weight given to clean vs. 
dirty pages in active queue scans"); static u_int vm_background_launder_rate = 4096; SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, &vm_background_launder_rate, 0, "background laundering rate, in kilobytes per second"); static u_int vm_background_launder_max = 20 * 1024; SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN, &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); u_long vm_page_max_user_wired; SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW, &vm_page_max_user_wired, 0, "system-wide limit to user-wired page count"); static u_int isqrt(u_int num); static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall); static void vm_pageout_laundry_worker(void *arg); struct scan_state { struct vm_batchqueue bq; struct vm_pagequeue *pq; vm_page_t marker; int maxscan; int scanned; }; static void vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq, vm_page_t marker, vm_page_t after, int maxscan) { vm_pagequeue_assert_locked(pq); KASSERT((marker->a.flags & PGA_ENQUEUED) == 0, ("marker %p already enqueued", marker)); if (after == NULL) TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q); else TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q); vm_page_aflag_set(marker, PGA_ENQUEUED); vm_batchqueue_init(&ss->bq); ss->pq = pq; ss->marker = marker; ss->maxscan = maxscan; ss->scanned = 0; vm_pagequeue_unlock(pq); } static void vm_pageout_end_scan(struct scan_state *ss) { struct vm_pagequeue *pq; pq = ss->pq; vm_pagequeue_assert_locked(pq); KASSERT((ss->marker->a.flags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q); vm_page_aflag_clear(ss->marker, PGA_ENQUEUED); pq->pq_pdpages += ss->scanned; } /* * Add a small number of queued pages to a batch queue for later processing * without the corresponding queue lock held. The caller must have enqueued a * marker page at the desired start point for the scan. 
Pages will be * physically dequeued if the caller so requests. Otherwise, the returned * batch may contain marker pages, and it is up to the caller to handle them. * * When processing the batch queue, vm_pageout_defer() must be used to * determine whether the page has been logically dequeued since the batch was * collected. */ static __always_inline void vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) { struct vm_pagequeue *pq; vm_page_t m, marker, n; marker = ss->marker; pq = ss->pq; KASSERT((marker->a.flags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); vm_pagequeue_lock(pq); for (m = TAILQ_NEXT(marker, plinks.q); m != NULL && ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE; m = n, ss->scanned++) { n = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) == 0) { KASSERT((m->a.flags & PGA_ENQUEUED) != 0, ("page %p not enqueued", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in page queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in page queue", m)); } else if (dequeue) continue; (void)vm_batchqueue_insert(&ss->bq, m); if (dequeue) { TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_page_aflag_clear(m, PGA_ENQUEUED); } } TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q); if (__predict_true(m != NULL)) TAILQ_INSERT_BEFORE(m, marker, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q); if (dequeue) vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt); vm_pagequeue_unlock(pq); } /* * Return the next page to be scanned, or NULL if the scan is complete. */ static __always_inline vm_page_t vm_pageout_next(struct scan_state *ss, const bool dequeue) { if (ss->bq.bq_cnt == 0) vm_pageout_collect_batch(ss, dequeue); return (vm_batchqueue_pop(&ss->bq)); } /* * Determine whether processing of a page should be deferred and ensure that any * outstanding queue operations are processed. 
*/
/*
 * Returns true when the caller should skip the page: either its queue or
 * PGA_ENQUEUED state no longer matches what the caller expects, or a queue
 * operation was pending (in which case it is submitted here first).
 */
static __always_inline bool
vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued)
{
	vm_page_astate_t as;

	as = vm_page_astate_load(m);
	if (__predict_false(as.queue != queue ||
	    ((as.flags & PGA_ENQUEUED) != 0) != enqueued))
		return (true);
	if ((as.flags & PGA_QUEUE_OP_MASK) != 0) {
		vm_page_pqbatch_submit(m, queue);
		return (true);
	}
	return (false);
}

/*
 * Scan for pages at adjacent offsets within the given page's object that are
 * eligible for laundering, form a cluster of these pages and the given page,
 * and launder that cluster.
 *
 * The object must be write-locked and the page exclusively busied.  Returns
 * the value returned by vm_pageout_flush() for the cluster.
 */
static int
vm_pageout_cluster(vm_page_t m)
{
	vm_object_t object;
	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
	vm_pindex_t pindex;
	int ib, is, page_base, pageout_count;

	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	pindex = m->pindex;

	vm_page_assert_xbusied(m);

	/*
	 * Place the given page in the middle of the array so that the
	 * cluster can grow in both directions.  "pb" and "ps" track the
	 * lowest and highest pages gathered so far; "ib" and "is" are the
	 * next backward and forward distances from pindex.
	 */
	mc[vm_pageout_page_count] = pb = ps = m;
	pageout_count = 1;
	page_base = vm_pageout_page_count;
	ib = 1;
	is = 1;

	/*
	 * We can cluster only if the page is not clean, busy, or held, and
	 * the page is in the laundry queue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying to
	 * align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
more:
	while (ib != 0 && pageout_count < vm_pageout_page_count) {
		/* Stop once the reverse scan would pass offset zero. */
		if (ib > pindex) {
			ib = 0;
			break;
		}
		if ((p = vm_page_prev(pb)) == NULL ||
		    vm_page_tryxbusy(p) == 0) {
			ib = 0;
			break;
		}
		if (vm_page_wired(p)) {
			ib = 0;
			vm_page_xunbusy(p);
			break;
		}
		vm_page_test_dirty(p);
		if (p->dirty == 0) {
			ib = 0;
			vm_page_xunbusy(p);
			break;
		}
		if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) {
			vm_page_xunbusy(p);
			ib = 0;
			break;
		}
		mc[--page_base] = pb = p;
		++pageout_count;
		++ib;

		/*
		 * We are at an alignment boundary.  Stop here, and switch
		 * directions.  Do not clear ib. 
		 */
		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
			break;
	}
	while (pageout_count < vm_pageout_page_count &&
	    pindex + is < object->size) {
		if ((p = vm_page_next(ps)) == NULL ||
		    vm_page_tryxbusy(p) == 0)
			break;
		if (vm_page_wired(p)) {
			vm_page_xunbusy(p);
			break;
		}
		vm_page_test_dirty(p);
		if (p->dirty == 0) {
			vm_page_xunbusy(p);
			break;
		}
		if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) {
			vm_page_xunbusy(p);
			break;
		}
		mc[page_base + pageout_count] = ps = p;
		++pageout_count;
		++is;
	}

	/*
	 * If we exhausted our forward scan, continue with the reverse scan
	 * when possible, even past an alignment boundary.  This catches
	 * boundary conditions.
	 */
	if (ib != 0 && pageout_count < vm_pageout_page_count)
		goto more;

	return (vm_pageout_flush(&mc[page_base], pageout_count,
	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.  Note that we setup for the start of
 *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
 *	reference count all in here rather than in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 *
 *	Returned runlen is the count of pages between mreq and first
 *	page after mreq with status VM_PAGER_AGAIN.
 *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
 *	for any page in runlen set.
 *
 *	The return value counts the pages whose status was VM_PAGER_OK or
 *	VM_PAGER_PEND, plus pages moved to the unswappable queue.  prunlen
 *	and eio may be NULL.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
    boolean_t *eio)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i, runlen;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Initiate I/O.  Mark the pages shared busy and verify that they're
	 * valid and read-only.
	 *
	 * We do not have to fixup the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments. 
	 */
	for (i = 0; i < count; i++) {
		KASSERT(vm_page_all_valid(mc[i]),
		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
			mc[i], i, count));
		KASSERT((mc[i]->a.flags & PGA_WRITEABLE) == 0,
		    ("vm_pageout_flush: writeable page %p", mc[i]));
		vm_page_busy_downgrade(mc[i]);
	}
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count, flags, pageout_status);

	/* Start with the full run after mreq; VM_PAGER_AGAIN shortens it. */
	runlen = count - mreq;
	if (eio != NULL)
		*eio = FALSE;
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
		    !pmap_page_is_write_mapped(mt),
		    ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			/*
			 * The page may have moved since laundering started, in
			 * which case it should be left alone.
			 */
			if (vm_page_in_laundry(mt))
				vm_page_deactivate_noreuse(mt);
			/* FALLTHROUGH */
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * The page is outside the object's range.  We pretend
			 * that the page out worked and clean the page, so the
			 * changes will be lost if the page is reclaimed by
			 * the page daemon.
			 */
			vm_page_undirty(mt);
			if (vm_page_in_laundry(mt))
				vm_page_deactivate_noreuse(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If the page couldn't be paged out to swap because the
			 * pager wasn't able to find space, place the page in
			 * the PQ_UNSWAPPABLE holding queue.  This is an
			 * optimization that prevents the page daemon from
			 * wasting CPU cycles on pages that cannot be reclaimed
			 * because no swap device is configured.
			 *
			 * Otherwise, reactivate the page so that it doesn't
			 * clog the laundry and inactive queues.  (We will try
			 * paging it out again later.) 
			 */
			if ((object->flags & OBJ_SWAP) != 0 &&
			    pageout_status[i] == VM_PAGER_FAIL) {
				vm_page_unswappable(mt);
				numpagedout++;
			} else
				vm_page_activate(mt);
			if (eio != NULL && i >= mreq && i - mreq < runlen)
				*eio = TRUE;
			break;
		case VM_PAGER_AGAIN:
			if (i >= mreq && i - mreq < runlen)
				runlen = i - mreq;
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses.  Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_object_pip_wakeup(object);
			vm_page_sunbusy(mt);
		}
	}
	if (prunlen != NULL)
		*prunlen = runlen;
	return (numpagedout);
}

/*
 * swapon event handler: record that a swap device is available.
 */
static void
vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
{

	atomic_store_rel_int(&swapdev_enabled, 1);
}

/*
 * swapoff event handler: clear swapdev_enabled when swap_pager_nswapdev()
 * returns 1 at the time of the call.
 */
static void
vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
{

	if (swap_pager_nswapdev() == 1)
		atomic_store_rel_int(&swapdev_enabled, 0);
}

/*
 * Attempt to acquire all of the necessary locks to launder a page and
 * then call through the clustering layer to PUTPAGES.  Wait a short
 * time for a vnode lock.
 *
 * Requires the page and object lock on entry, releases both before return.
 * Returns 0 on success and an errno otherwise.
 *
 * On success *numpagedout holds the value returned by vm_pageout_cluster().
 * A return of EDEADLK means vn_start_write() or vget() failed.
 */
static int
vm_pageout_clean(vm_page_t m, int *numpagedout)
{
	struct vnode *vp;
	struct mount *mp;
	vm_object_t object;
	vm_pindex_t pindex;
	int error;

	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	error = 0;
	vp = NULL;
	mp = NULL;

	/*
	 * The object is already known NOT to be dead.   It
	 * is possible for the vget() to block the whole
	 * pageout daemon, but the new low-memory handling
	 * code should prevent it.
	 *
	 * We can't wait forever for the vnode lock, we might
	 * deadlock due to a vn_read() getting stuck in
	 * vm_wait while holding this vnode.  We skip the
	 * vnode if we can't get it in a reasonable amount
	 * of time. 
	 */
	if (object->type == OBJT_VNODE) {
		/* The page is re-busied below, after the vnode is locked. */
		vm_page_xunbusy(m);
		vp = object->handle;
		if (vp->v_type == VREG &&
		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			mp = NULL;
			error = EDEADLK;
			goto unlock_all;
		}
		KASSERT(mp != NULL,
		    ("vp %p with NULL v_mount", vp));
		/* Reference the object while its lock is dropped for vget(). */
		vm_object_reference_locked(object);
		pindex = m->pindex;
		VM_OBJECT_WUNLOCK(object);
		if (vget(vp, vn_lktype_write(NULL, vp) | LK_TIMELOCK) != 0) {
			vp = NULL;
			error = EDEADLK;
			goto unlock_mp;
		}
		VM_OBJECT_WLOCK(object);

		/*
		 * Ensure that the object and vnode were not disassociated
		 * while locks were dropped.
		 */
		if (vp->v_object != object) {
			error = ENOENT;
			goto unlock_all;
		}

		/*
		 * While the object was unlocked, the page may have been:
		 * (1) moved to a different queue,
		 * (2) reallocated to a different object,
		 * (3) reallocated to a different offset, or
		 * (4) cleaned.
		 */
		if (!vm_page_in_laundry(m) || m->object != object ||
		    m->pindex != pindex || m->dirty == 0) {
			error = ENXIO;
			goto unlock_all;
		}

		/*
		 * The page may have been busied while the object lock was
		 * released.
		 */
		if (vm_page_tryxbusy(m) == 0) {
			error = EBUSY;
			goto unlock_all;
		}
	}

	/*
	 * Remove all writeable mappings, failing if the page is wired.
	 */
	if (!vm_page_try_remove_write(m)) {
		vm_page_xunbusy(m);
		error = EBUSY;
		goto unlock_all;
	}

	/*
	 * If a page is dirty, then it is either being washed
	 * (but not yet cleaned) or it is still in the
	 * laundry.  If it is still in the laundry, then we
	 * start the cleaning operation.
	 */
	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
		error = EIO;

unlock_all:
	VM_OBJECT_WUNLOCK(object);

unlock_mp:
	/* mp is non-NULL only when vn_start_write() succeeded above. */
	if (mp != NULL) {
		if (vp != NULL)
			vput(vp);
		vm_object_deallocate(object);
		vn_finished_write(mp);
	}

	return (error);
}

/*
 * Attempt to launder the specified number of pages.
 *
 * Returns the number of pages successfully laundered. 
*/
static int
vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
{
	struct scan_state ss;
	struct vm_pagequeue *pq;
	vm_object_t object;
	vm_page_t m, marker;
	vm_page_astate_t new, old;
	int act_delta, error, numpagedout, queue, refs, starting_target;
	int vnodes_skipped;
	bool pageout_ok;

	object = NULL;
	starting_target = launder;
	vnodes_skipped = 0;

	/*
	 * Scan the laundry queues for pages eligible to be laundered.  We stop
	 * once the target number of dirty pages have been laundered, or once
	 * we've reached the end of the queue.  A single iteration of this loop
	 * may cause more than one page to be laundered because of clustering.
	 *
	 * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
	 * swap devices are configured.
	 */
	if (atomic_load_acq_int(&swapdev_enabled))
		queue = PQ_UNSWAPPABLE;
	else
		queue = PQ_LAUNDRY;

scan:
	marker = &vmd->vmd_markers[queue];
	pq = &vmd->vmd_pagequeues[queue];
	vm_pagequeue_lock(pq);
	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
	while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
		if (__predict_false((m->flags & PG_MARKER) != 0))
			continue;

		/*
		 * Don't touch a page that was removed from the queue after the
		 * page queue lock was released.  Otherwise, ensure that any
		 * pending queue operations, such as dequeues for wired pages,
		 * are handled.
		 */
		if (vm_pageout_defer(m, queue, true))
			continue;

		/*
		 * Lock the page's object.
		 */
		if (object == NULL || object != m->object) {
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);
			object = atomic_load_ptr(&m->object);
			if (__predict_false(object == NULL))
				/* The page is being freed by another thread. */
				continue;

			/* Depends on type-stability. */
			VM_OBJECT_WLOCK(object);
			if (__predict_false(m->object != object)) {
				VM_OBJECT_WUNLOCK(object);
				object = NULL;
				continue;
			}
		}

		if (vm_page_tryxbusy(m) == 0)
			continue;

		/*
		 * Check for wirings now that we hold the object lock and have
		 * exclusively busied the page.  If the page is mapped, it may
		 * still be wired by pmap lookups. 
		 * The call to
		 * vm_page_try_remove_all() below atomically checks for such
		 * wirings and removes mappings.  If the page is unmapped, the
		 * wire count is guaranteed not to increase after this check.
		 */
		if (__predict_false(vm_page_wired(m)))
			goto skip_page;

		/*
		 * Invalid pages can be easily freed.  They cannot be
		 * mapped; vm_page_free() asserts this.
		 */
		if (vm_page_none_valid(m))
			goto free_page;

		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;

		/* Retry loop: a failed commit restarts with the reloaded state. */
		for (old = vm_page_astate_load(m);;) {
			/*
			 * Check to see if the page has been removed from the
			 * queue since the first such check.  Leave it alone if
			 * so, discarding any references collected by
			 * pmap_ts_referenced().
			 */
			if (__predict_false(_vm_page_queue(old) == PQ_NONE))
				goto skip_page;

			new = old;
			act_delta = refs;
			if ((old.flags & PGA_REFERENCED) != 0) {
				new.flags &= ~PGA_REFERENCED;
				act_delta++;
			}
			if (act_delta == 0) {
				;
			} else if (object->ref_count != 0) {
				/*
				 * Increase the activation count if the page was
				 * referenced while in the laundry queue.  This
				 * makes it less likely that the page will be
				 * returned prematurely to the laundry queue.
				 */
				new.act_count += ACT_ADVANCE + act_delta;
				if (new.act_count > ACT_MAX)
					new.act_count = ACT_MAX;

				new.flags &= ~PGA_QUEUE_OP_MASK;
				new.flags |= PGA_REQUEUE;
				new.queue = PQ_ACTIVE;
				if (!vm_page_pqstate_commit(m, &old, new))
					continue;

				/*
				 * If this was a background laundering, count
				 * activated pages towards our target.  The
				 * purpose of background laundering is to ensure
				 * that pages are eventually cycled through the
				 * laundry queue, and an activation is a valid
				 * way out.
				 */
				if (!in_shortfall)
					launder--;
				VM_CNT_INC(v_reactivated);
				goto skip_page;
			} else if ((object->flags & OBJ_DEAD) == 0) {
				new.flags |= PGA_REQUEUE;
				if (!vm_page_pqstate_commit(m, &old, new))
					continue;
				goto skip_page;
			}
			break;
		}

		/*
		 * If the page appears to be clean at the machine-independent
		 * layer, then remove all of its mappings from the pmap in
		 * anticipation of freeing it. 
		 * If, however, any of the page's
		 * mappings allow write access, then the page may still be
		 * modified until the last of those mappings are removed.
		 */
		if (object->ref_count != 0) {
			vm_page_test_dirty(m);
			if (m->dirty == 0 && !vm_page_try_remove_all(m))
				goto skip_page;
		}

		/*
		 * Clean pages are freed, and dirty pages are paged out unless
		 * they belong to a dead object.  Requeueing dirty pages from
		 * dead objects is pointless, as they are being paged out and
		 * freed by the thread that destroyed the object.
		 */
		if (m->dirty == 0) {
free_page:
			/*
			 * Now we are guaranteed that no other threads are
			 * manipulating the page, check for a last-second
			 * reference.
			 */
			if (vm_pageout_defer(m, queue, true))
				goto skip_page;
			vm_page_free(m);
			VM_CNT_INC(v_dfree);
		} else if ((object->flags & OBJ_DEAD) == 0) {
			/*
			 * NOTE(review): the statements prefixed with '-' and
			 * '+' below are unified-diff markers carried in this
			 * text (removed and added lines of a patch hunk); they
			 * are reproduced unchanged.
			 */
-			if ((object->flags & OBJ_SWAP) == 0 &&
-			    object->type != OBJT_DEFAULT)
-				pageout_ok = true;
-			else if (disable_swap_pageouts)
-				pageout_ok = false;
+			if ((object->flags & OBJ_SWAP) != 0)
+				pageout_ok = disable_swap_pageouts == 0;
			else
				pageout_ok = true;
			if (!pageout_ok) {
				vm_page_launder(m);
				goto skip_page;
			}

			/*
			 * Form a cluster with adjacent, dirty pages from the
			 * same object, and page out that entire cluster.
			 *
			 * The adjacent, dirty pages must also be in the
			 * laundry.  However, their mappings are not checked
			 * for new references.  Consequently, a recently
			 * referenced page may be paged out.  However, that
			 * page will not be prematurely reclaimed.  After page
			 * out, the page will be placed in the inactive queue,
			 * where any new references will be detected and the
			 * page reactivated. 
			 */
			error = vm_pageout_clean(m, &numpagedout);
			if (error == 0) {
				launder -= numpagedout;
				ss.scanned += numpagedout;
			} else if (error == EDEADLK) {
				pageout_lock_miss++;
				vnodes_skipped++;
			}
			/* vm_pageout_clean() released the object lock. */
			object = NULL;
		} else {
skip_page:
			vm_page_xunbusy(m);
		}
	}
	if (object != NULL) {
		VM_OBJECT_WUNLOCK(object);
		object = NULL;
	}
	vm_pagequeue_lock(pq);
	vm_pageout_end_scan(&ss);
	vm_pagequeue_unlock(pq);

	/* Continue with PQ_LAUNDRY if the first queue did not meet the target. */
	if (launder > 0 && queue == PQ_UNSWAPPABLE) {
		queue = PQ_LAUNDRY;
		goto scan;
	}

	/*
	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
	 * and we didn't launder enough pages.
	 */
	if (vnodes_skipped > 0 && launder > 0)
		(void)speedup_syncer();

	return (starting_target - launder);
}

/*
 * Compute the integer square root.
 *
 * Works bit pair by bit pair, starting from the largest power of four that
 * does not exceed "num"; returns 0 for an input of 0.
 */
static u_int
isqrt(u_int num)
{
	u_int bit, root, tmp;

	/* Clearing the low bit of the shift count keeps "bit" a power of four. */
	bit = num != 0 ? (1u << ((fls(num) - 1) & ~1)) : 0;
	root = 0;
	while (bit != 0) {
		tmp = root + bit;
		root >>= 1;
		if (num >= tmp) {
			num -= tmp;
			root += bit;
		}
		bit >>= 2;
	}
	return (root);
}

/*
 * Perform the work of the laundry thread: periodically wake up and determine
 * whether any pages need to be laundered.  If so, determine the number of pages
 * that need to be laundered, and launder them.
 *
 * "arg" carries the domain index, cast to a pointer.  This function does not
 * return.
 */
static void
vm_pageout_laundry_worker(void *arg)
{
	struct vm_domain *vmd;
	struct vm_pagequeue *pq;
	uint64_t nclean, ndirty, nfreed;
	int domain, last_target, launder, shortfall, shortfall_cycle, target;
	bool in_shortfall;

	domain = (uintptr_t)arg;
	vmd = VM_DOMAIN(domain);
	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));

	shortfall = 0;
	in_shortfall = false;
	shortfall_cycle = 0;
	last_target = target = 0;
	nfreed = 0;

	/*
	 * Calls to these handlers are serialized by the swap syscall lock.
	 */
	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
	    EVENTHANDLER_PRI_ANY);
	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
	    EVENTHANDLER_PRI_ANY);

	/*
	 * The pageout laundry worker is never done, so loop forever. 
	 */
	for (;;) {
		KASSERT(target >= 0, ("negative target %d", target));
		KASSERT(shortfall_cycle >= 0,
		    ("negative cycle %d", shortfall_cycle));
		launder = 0;

		/*
		 * First determine whether we need to launder pages to meet a
		 * shortage of free pages.
		 */
		if (shortfall > 0) {
			in_shortfall = true;
			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
			target = shortfall;
		} else if (!in_shortfall)
			goto trybackground;
		else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
			/*
			 * We recently entered shortfall and began laundering
			 * pages.  If we have completed that laundering run
			 * (and we are no longer in shortfall) or we have met
			 * our laundry target through other activity, then we
			 * can stop laundering pages.
			 */
			in_shortfall = false;
			target = 0;
			goto trybackground;
		}
		/* Spread the remaining target over the remaining cycles. */
		launder = target / shortfall_cycle--;
		goto dolaundry;

		/*
		 * There's no immediate need to launder any pages; see if we
		 * meet the conditions to perform background laundering:
		 *
		 * 1. The ratio of dirty to clean inactive pages exceeds the
		 *    background laundering threshold, or
		 * 2. we haven't yet reached the target of the current
		 *    background laundering run.
		 *
		 * The background laundering threshold is not a constant.
		 * Instead, it is a slowly growing function of the number of
		 * clean pages freed by the page daemon since the last
		 * background laundering.  Thus, as the ratio of dirty to
		 * clean inactive pages grows, the amount of memory pressure
		 * required to trigger laundering decreases.  We ensure
		 * that the threshold is non-zero after an inactive queue
		 * scan, even if that scan failed to free a single clean page.
		 */
trybackground:
		nclean = vmd->vmd_free_count +
		    vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
		ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
		if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,
		    vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {
			target = vmd->vmd_background_launder_target;
		}

		/*
		 * We have a non-zero background laundering target. 
		 * If we've
		 * laundered up to our maximum without observing a page daemon
		 * request, just stop.  This is a safety belt that ensures we
		 * don't launder an excessive amount if memory pressure is low
		 * and the ratio of dirty to clean pages is large.  Otherwise,
		 * proceed at the background laundering rate.
		 *
		 * NOTE(review): the sysctl descriptions give
		 * vm_background_launder_max and vm_background_launder_rate in
		 * kilobytes, while "target" and "launder" are page counts;
		 * confirm that scaling by PAGE_SIZE / 1024 is the intended
		 * conversion.
		 */
		if (target > 0) {
			if (nfreed > 0) {
				nfreed = 0;
				last_target = target;
			} else if (last_target - target >=
			    vm_background_launder_max * PAGE_SIZE / 1024) {
				target = 0;
			}
			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
			launder /= VM_LAUNDER_RATE;
			if (launder > target)
				launder = target;
		}

dolaundry:
		if (launder > 0) {
			/*
			 * Because of I/O clustering, the number of laundered
			 * pages could exceed "target" by the maximum size of
			 * a cluster minus one.
			 */
			target -= min(vm_pageout_launder(vmd, launder,
			    in_shortfall), target);
			pause("laundp", hz / VM_LAUNDER_RATE);
		}

		/*
		 * If we're not currently laundering pages and the page daemon
		 * hasn't posted a new request, sleep until the page daemon
		 * kicks us.
		 */
		vm_pagequeue_lock(pq);
		if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
			(void)mtx_sleep(&vmd->vmd_laundry_request,
			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);

		/*
		 * If the pagedaemon has indicated that it's in shortfall, start
		 * a shortfall laundering unless we're already in the middle of
		 * one.  This may preempt a background laundering.
		 */
		if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
		    (!in_shortfall || shortfall_cycle == 0)) {
			shortfall = vm_laundry_target(vmd) +
			    vmd->vmd_pageout_deficit;
			target = 0;
		} else
			shortfall = 0;

		if (target == 0)
			vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
		/* Collect and reset the freed-clean-page count under the queue lock. */
		nfreed += vmd->vmd_clean_pages_freed;
		vmd->vmd_clean_pages_freed = 0;
		vm_pagequeue_unlock(pq);
	}
}

/*
 * Compute the number of pages we want to try to move from the
 * active queue to either the inactive or laundry queue.
 *
 * When scanning active pages during a shortage, we make clean pages
 * count more heavily towards the page shortage than dirty pages. 
* This is because dirty pages must be laundered before they can be
 * reused and thus have less utility when attempting to quickly
 * alleviate a free page shortage.  However, this weighting also
 * causes the scan to deactivate dirty pages more aggressively,
 * improving the effectiveness of clustering.
 */
static int
vm_pageout_active_target(struct vm_domain *vmd)
{
	int shortage;

	/*
	 * Laundry pages are discounted by act_scan_laundry_weight before
	 * being subtracted, and the result is scaled up by the same weight.
	 */
	shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) -
	    (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
	    vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight);
	shortage *= act_scan_laundry_weight;
	return (shortage);
}

/*
 * Scan the active queue.  If there is no shortage of inactive pages, scan a
 * small portion of the queue in order to maintain quasi-LRU.
 */
static void
vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
{
	struct scan_state ss;
	vm_object_t object;
	vm_page_t m, marker;
	struct vm_pagequeue *pq;
	vm_page_astate_t old, new;
	long min_scan;
	int act_delta, max_scan, ps_delta, refs, scan_tick;
	uint8_t nqueue;

	marker = &vmd->vmd_markers[PQ_ACTIVE];
	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
	vm_pagequeue_lock(pq);

	/*
	 * If we're just idle polling attempt to visit every
	 * active page within 'update_period' seconds.
	 */
	scan_tick = ticks;
	if (vm_pageout_update_period != 0) {
		min_scan = pq->pq_cnt;
		min_scan *= scan_tick - vmd->vmd_last_active_scan;
		min_scan /= hz * vm_pageout_update_period;
	} else
		min_scan = 0;
	if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))
		vmd->vmd_last_active_scan = scan_tick;

	/*
	 * Scan the active queue for pages that can be deactivated.  Update
	 * the per-page activity counter and use it to identify deactivation
	 * candidates.  Held pages may be deactivated.
	 *
	 * To avoid requeuing each page that remains in the active queue, we
	 * implement the CLOCK algorithm.  To keep the implementation of the
	 * enqueue operation consistent for all page queues, we use two hands,
	 * represented by marker pages. 
	 * Scans begin at the first hand, which
	 * precedes the second hand in the queue.  When the two hands meet,
	 * they are moved back to the head and tail of the queue, respectively,
	 * and scanning resumes.
	 */
	max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;
act_scan:
	vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
	while ((m = vm_pageout_next(&ss, false)) != NULL) {
		if (__predict_false(m == &vmd->vmd_clock[1])) {
			/*
			 * The hands met: reset them and restart the scan with
			 * the remaining budget.  The queue lock taken here is
			 * released by vm_pageout_init_scan() at act_scan.
			 */
			vm_pagequeue_lock(pq);
			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
			TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
			    plinks.q);
			TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
			    plinks.q);
			max_scan -= ss.scanned;
			vm_pageout_end_scan(&ss);
			goto act_scan;
		}
		if (__predict_false((m->flags & PG_MARKER) != 0))
			continue;

		/*
		 * Don't touch a page that was removed from the queue after the
		 * page queue lock was released.  Otherwise, ensure that any
		 * pending queue operations, such as dequeues for wired pages,
		 * are handled.
		 */
		if (vm_pageout_defer(m, PQ_ACTIVE, true))
			continue;

		/*
		 * A page's object pointer may be set to NULL before
		 * the object lock is acquired.
		 */
		object = atomic_load_ptr(&m->object);
		if (__predict_false(object == NULL))
			/*
			 * The page has been removed from its object.
			 */
			continue;

		/* Deferred free of swap space. */
		if ((m->a.flags & PGA_SWAP_FREE) != 0 &&
		    VM_OBJECT_TRYWLOCK(object)) {
			if (m->object == object)
				vm_pager_page_unswapped(m);
			VM_OBJECT_WUNLOCK(object);
		}

		/*
		 * Check to see "how much" the page has been used.
		 *
		 * Test PGA_REFERENCED after calling pmap_ts_referenced() so
		 * that a reference from a concurrently destroyed mapping is
		 * observed here and now.
		 *
		 * Perform an unsynchronized object ref count check. 
		 * While
		 * the page lock ensures that the page is not reallocated to
		 * another object, in particular, one with unmanaged mappings
		 * that cannot support pmap_ts_referenced(), two races are,
		 * nonetheless, possible:
		 * 1) The count was transitioning to zero, but we saw a non-
		 *    zero value.  pmap_ts_referenced() will return zero
		 *    because the page is not mapped.
		 * 2) The count was transitioning to one, but we saw zero.
		 *    This race delays the detection of a new reference.  At
		 *    worst, we will deactivate and reactivate the page.
		 */
		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;

		/* Recompute and retry until the new state is committed. */
		old = vm_page_astate_load(m);
		do {
			/*
			 * Check to see if the page has been removed from the
			 * queue since the first such check.  Leave it alone if
			 * so, discarding any references collected by
			 * pmap_ts_referenced().
			 */
			if (__predict_false(_vm_page_queue(old) == PQ_NONE)) {
				ps_delta = 0;
				break;
			}

			/*
			 * Advance or decay the act_count based on recent usage.
			 */
			new = old;
			act_delta = refs;
			if ((old.flags & PGA_REFERENCED) != 0) {
				new.flags &= ~PGA_REFERENCED;
				act_delta++;
			}
			if (act_delta != 0) {
				new.act_count += ACT_ADVANCE + act_delta;
				if (new.act_count > ACT_MAX)
					new.act_count = ACT_MAX;
			} else {
				new.act_count -= min(new.act_count,
				    ACT_DECLINE);
			}

			if (new.act_count > 0) {
				/*
				 * Adjust the activation count and keep the page
				 * in the active queue.  The count might be left
				 * unchanged if it is saturated.  The page may
				 * have been moved to a different queue since we
				 * started the scan, in which case we move it
				 * back.
				 */
				ps_delta = 0;
				if (old.queue != PQ_ACTIVE) {
					new.flags &= ~PGA_QUEUE_OP_MASK;
					new.flags |= PGA_REQUEUE;
					new.queue = PQ_ACTIVE;
				}
			} else {
				/*
				 * When not short for inactive pages, let dirty
				 * pages go through the inactive queue before
				 * moving to the laundry queue.  This gives them
				 * some extra time to be reactivated,
				 * potentially avoiding an expensive pageout. 
				 * However, during a page shortage, the inactive
				 * queue is necessarily small, and so dirty
				 * pages would only spend a trivial amount of
				 * time in the inactive queue.  Therefore, we
				 * might as well place them directly in the
				 * laundry queue to reduce queuing overhead.
				 *
				 * Calling vm_page_test_dirty() here would
				 * require acquisition of the object's write
				 * lock.  However, during a page shortage,
				 * directing dirty pages into the laundry queue
				 * is only an optimization and not a
				 * requirement.  Therefore, we simply rely on
				 * the opportunistic updates to the page's dirty
				 * field by the pmap.
				 */
				if (page_shortage <= 0) {
					nqueue = PQ_INACTIVE;
					ps_delta = 0;
				} else if (m->dirty == 0) {
					nqueue = PQ_INACTIVE;
					ps_delta = act_scan_laundry_weight;
				} else {
					nqueue = PQ_LAUNDRY;
					ps_delta = 1;
				}

				new.flags &= ~PGA_QUEUE_OP_MASK;
				new.flags |= PGA_REQUEUE;
				new.queue = nqueue;
			}
		} while (!vm_page_pqstate_commit(m, &old, new));

		page_shortage -= ps_delta;
	}
	/* Park the first clock hand just after the scan marker's final position. */
	vm_pagequeue_lock(pq);
	TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
	TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
	vm_pageout_end_scan(&ss);
	vm_pagequeue_unlock(pq);
}

/*
 * Put a page back into the inactive queue immediately before the marker,
 * provided its queue is still PQ_INACTIVE and it is not already flagged
 * PGA_ENQUEUED.  The queue lock must be held.
 *
 * Returns 1 if the page was reinserted and 0 otherwise.
 */
static int
vm_pageout_reinsert_inactive_page(struct vm_pagequeue *pq, vm_page_t marker,
    vm_page_t m)
{
	vm_page_astate_t as;

	vm_pagequeue_assert_locked(pq);

	as = vm_page_astate_load(m);
	if (as.queue != PQ_INACTIVE || (as.flags & PGA_ENQUEUED) != 0)
		return (0);
	vm_page_aflag_set(m, PGA_ENQUEUED);
	TAILQ_INSERT_BEFORE(marker, m, plinks.q);
	return (1);
}

/*
 * Re-add stuck pages to the inactive queue.  We will examine them again
 * during the next scan.  If the queue state of a page has changed since
 * it was physically removed from the page queue in
 * vm_pageout_collect_batch(), don't do anything with that page. 
*/
static void
vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
    vm_page_t m)
{
	struct vm_pagequeue *pq;
	vm_page_t marker;
	int delta;

	delta = 0;
	marker = ss->marker;
	pq = ss->pq;

	if (m != NULL) {
		/* Batch the page if there is room, avoiding the queue lock. */
		if (vm_batchqueue_insert(bq, m))
			return;
		vm_pagequeue_lock(pq);
		delta += vm_pageout_reinsert_inactive_page(pq, marker, m);
	} else
		vm_pagequeue_lock(pq);
	/* Drain the batch, counting the pages actually reinserted. */
	while ((m = vm_batchqueue_pop(bq)) != NULL)
		delta += vm_pageout_reinsert_inactive_page(pq, marker, m);
	vm_pagequeue_cnt_add(pq, delta);
	vm_pagequeue_unlock(pq);
	vm_batchqueue_init(bq);
}

/*
 * Scan the inactive queue of the given domain, freeing pages until
 * "page_shortage" reaches zero or the queue has been scanned.  Referenced
 * pages are reactivated or requeued, and dirty pages of live objects are
 * sent to the laundry.  The estimate of stuck pages, the elapsed time and
 * the number of pages freed are accumulated into the domain.
 */
static void
vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage)
{
	struct timeval start, end;
	struct scan_state ss;
	struct vm_batchqueue rq;
	struct vm_page marker_page;
	vm_page_t m, marker;
	struct vm_pagequeue *pq;
	vm_object_t object;
	vm_page_astate_t old, new;
	int act_delta, addl_page_shortage, starting_page_shortage, refs;

	object = NULL;
	vm_batchqueue_init(&rq);
	getmicrouptime(&start);

	/*
	 * The addl_page_shortage is an estimate of the number of temporarily
	 * stuck pages in the inactive queue.  In other words, the
	 * number of pages from the inactive count that should be
	 * discounted in setting the target for the active queue scan.
	 */
	addl_page_shortage = 0;

	/*
	 * Start scanning the inactive queue for pages that we can free.  The
	 * scan will stop when we reach the target or we have scanned the
	 * entire queue.  (Note that m->a.act_count is not used to make
	 * decisions for the inactive queue, only for the active queue.)
	 */
	starting_page_shortage = page_shortage;
	/* This scan uses a marker on its own stack. */
	marker = &marker_page;
	vm_page_init_marker(marker, PQ_INACTIVE, 0);
	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
	vm_pagequeue_lock(pq);
	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
	while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
		KASSERT((m->flags & PG_MARKER) == 0,
		    ("marker page %p was dequeued", m));

		/*
		 * Don't touch a page that was removed from the queue after the
		 * page queue lock was released. 
		 * Otherwise, ensure that any
		 * pending queue operations, such as dequeues for wired pages,
		 * are handled.
		 */
		if (vm_pageout_defer(m, PQ_INACTIVE, false))
			continue;

		/*
		 * Lock the page's object.
		 */
		if (object == NULL || object != m->object) {
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);
			object = atomic_load_ptr(&m->object);
			if (__predict_false(object == NULL))
				/* The page is being freed by another thread. */
				continue;

			/* Depends on type-stability. */
			VM_OBJECT_WLOCK(object);
			if (__predict_false(m->object != object)) {
				VM_OBJECT_WUNLOCK(object);
				object = NULL;
				goto reinsert;
			}
		}

		if (vm_page_tryxbusy(m) == 0) {
			/*
			 * Don't mess with busy pages.  Leave them at
			 * the front of the queue.  Most likely, they
			 * are being paged out and will leave the
			 * queue shortly after the scan finishes.  So,
			 * they ought to be discounted from the
			 * inactive count.
			 */
			addl_page_shortage++;
			goto reinsert;
		}

		/* Deferred free of swap space. */
		if ((m->a.flags & PGA_SWAP_FREE) != 0)
			vm_pager_page_unswapped(m);

		/*
		 * Check for wirings now that we hold the object lock and have
		 * exclusively busied the page.  If the page is mapped, it may
		 * still be wired by pmap lookups.  The call to
		 * vm_page_try_remove_all() below atomically checks for such
		 * wirings and removes mappings.  If the page is unmapped, the
		 * wire count is guaranteed not to increase after this check.
		 */
		if (__predict_false(vm_page_wired(m)))
			goto skip_page;

		/*
		 * Invalid pages can be easily freed.  They cannot be
		 * mapped, vm_page_free() asserts this.
		 */
		if (vm_page_none_valid(m))
			goto free_page;

		refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0;

		/* Retry loop: a failed commit restarts with the reloaded state. */
		for (old = vm_page_astate_load(m);;) {
			/*
			 * Check to see if the page has been removed from the
			 * queue since the first such check.  Leave it alone if
			 * so, discarding any references collected by
			 * pmap_ts_referenced(). 
			 */
			if (__predict_false(_vm_page_queue(old) == PQ_NONE))
				goto skip_page;

			new = old;
			act_delta = refs;
			if ((old.flags & PGA_REFERENCED) != 0) {
				new.flags &= ~PGA_REFERENCED;
				act_delta++;
			}
			if (act_delta == 0) {
				;
			} else if (object->ref_count != 0) {
				/*
				 * Increase the activation count if the
				 * page was referenced while in the
				 * inactive queue.  This makes it less
				 * likely that the page will be returned
				 * prematurely to the inactive queue.
				 */
				new.act_count += ACT_ADVANCE + act_delta;
				if (new.act_count > ACT_MAX)
					new.act_count = ACT_MAX;

				new.flags &= ~PGA_QUEUE_OP_MASK;
				new.flags |= PGA_REQUEUE;
				new.queue = PQ_ACTIVE;
				if (!vm_page_pqstate_commit(m, &old, new))
					continue;

				VM_CNT_INC(v_reactivated);
				goto skip_page;
			} else if ((object->flags & OBJ_DEAD) == 0) {
				new.queue = PQ_INACTIVE;
				new.flags |= PGA_REQUEUE;
				if (!vm_page_pqstate_commit(m, &old, new))
					continue;
				goto skip_page;
			}
			break;
		}

		/*
		 * If the page appears to be clean at the machine-independent
		 * layer, then remove all of its mappings from the pmap in
		 * anticipation of freeing it.  If, however, any of the page's
		 * mappings allow write access, then the page may still be
		 * modified until the last of those mappings are removed.
		 */
		if (object->ref_count != 0) {
			vm_page_test_dirty(m);
			if (m->dirty == 0 && !vm_page_try_remove_all(m))
				goto skip_page;
		}

		/*
		 * Clean pages can be freed, but dirty pages must be sent back
		 * to the laundry, unless they belong to a dead object.
		 * Requeueing dirty pages from dead objects is pointless, as
		 * they are being paged out and freed by the thread that
		 * destroyed the object.
		 */
		if (m->dirty == 0) {
free_page:
			/*
			 * Now we are guaranteed that no other threads are
			 * manipulating the page, check for a last-second
			 * reference that would save it from doom. 
			 */
			if (vm_pageout_defer(m, PQ_INACTIVE, false))
				goto skip_page;

			/*
			 * Because we dequeued the page and have already checked
			 * for pending dequeue and enqueue requests, we can
			 * safely disassociate the page from the inactive queue
			 * without holding the queue lock.
			 */
			m->a.queue = PQ_NONE;
			vm_page_free(m);
			page_shortage--;
			continue;
		}
		if ((object->flags & OBJ_DEAD) == 0)
			vm_page_launder(m);
skip_page:
		vm_page_xunbusy(m);
		continue;
reinsert:
		vm_pageout_reinsert_inactive(&ss, &rq, m);
	}
	if (object != NULL)
		VM_OBJECT_WUNLOCK(object);
	/* Flush pages still held in the reinsert batch and in the scan batch. */
	vm_pageout_reinsert_inactive(&ss, &rq, NULL);
	vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
	vm_pagequeue_lock(pq);
	vm_pageout_end_scan(&ss);
	vm_pagequeue_unlock(pq);

	/*
	 * Record the remaining shortage and the progress and rate it was made.
	 */
	atomic_add_int(&vmd->vmd_addl_shortage, addl_page_shortage);
	getmicrouptime(&end);
	timevalsub(&end, &start);
	atomic_add_int(&vmd->vmd_inactive_us,
	    end.tv_sec * 1000000 + end.tv_usec);
	atomic_add_int(&vmd->vmd_inactive_freed,
	    starting_page_shortage - page_shortage);
}

/*
 * Dispatch a number of inactive threads according to load and collect the
 * results to present a coherent view of paging activity on this domain.
 */
static int
vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage)
{
	u_int freed, pps, slop, threads, us;

	vmd->vmd_inactive_shortage = shortage;
	slop = 0;

	/*
	 * If we have more work than we can do in a quarter of our interval, we
	 * fire off multiple threads to process it.
	 */
	threads = vmd->vmd_inactive_threads;
	if (threads > 1 && vmd->vmd_inactive_pps != 0 &&
	    shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) {
		vmd->vmd_inactive_shortage /= threads;
		slop = shortage % threads;
		vm_domain_pageout_lock(vmd);
		blockcount_acquire(&vmd->vmd_inactive_starting, threads - 1);
		blockcount_acquire(&vmd->vmd_inactive_running, threads - 1);
		wakeup(&vmd->vmd_inactive_shortage);
		vm_domain_pageout_unlock(vmd);
	}

	/* Run the local thread scan. 
*/ vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage + slop); /* * Block until helper threads report results and then accumulate * totals. */ blockcount_wait(&vmd->vmd_inactive_running, NULL, "vmpoid", PVM); freed = atomic_readandclear_int(&vmd->vmd_inactive_freed); VM_CNT_ADD(v_dfree, freed); /* * Calculate the per-thread paging rate with an exponential decay of * prior results. Careful to avoid integer rounding errors with large * us values. */ us = max(atomic_readandclear_int(&vmd->vmd_inactive_us), 1); if (us > 1000000) /* Keep rounding to tenths */ pps = (freed * 10) / ((us * 10) / 1000000); else pps = (1000000 / us) * freed; vmd->vmd_inactive_pps = (vmd->vmd_inactive_pps / 2) + (pps / 2); return (shortage - freed); } /* * Attempt to reclaim the requested number of pages from the inactive queue. * Returns true if the shortage was addressed. */ static int vm_pageout_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage) { struct vm_pagequeue *pq; u_int addl_page_shortage, deficit, page_shortage; u_int starting_page_shortage; /* * vmd_pageout_deficit counts the number of pages requested in * allocations that failed because of a free page shortage. We assume * that the allocations will be reattempted and thus include the deficit * in our scan target. */ deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit); starting_page_shortage = shortage + deficit; /* * Run the inactive scan on as many threads as is necessary. */ page_shortage = vm_pageout_inactive_dispatch(vmd, starting_page_shortage); addl_page_shortage = atomic_readandclear_int(&vmd->vmd_addl_shortage); /* * Wake up the laundry thread so that it can perform any needed * laundering. If we didn't meet our target, we're in shortfall and * need to launder more aggressively. If PQ_LAUNDRY is empty and no * swap devices are configured, the laundry thread has no work to do, so * don't bother waking it up. 
* * The laundry thread uses the number of inactive queue scans elapsed * since the last laundering to determine whether to launder again, so * keep count. */ if (starting_page_shortage > 0) { pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; vm_pagequeue_lock(pq); if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE && (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) { if (page_shortage > 0) { vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL; VM_CNT_INC(v_pdshortfalls); } else if (vmd->vmd_laundry_request != VM_LAUNDRY_SHORTFALL) vmd->vmd_laundry_request = VM_LAUNDRY_BACKGROUND; wakeup(&vmd->vmd_laundry_request); } vmd->vmd_clean_pages_freed += starting_page_shortage - page_shortage; vm_pagequeue_unlock(pq); } /* * Wakeup the swapout daemon if we didn't free the targeted number of * pages. */ if (page_shortage > 0) vm_swapout_run(); /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Reclaim pages by swapping out idle processes, if configured to do so. */ vm_swapout_run_idle(); /* * See the description of addl_page_shortage above. */ *addl_shortage = addl_page_shortage + deficit; return (page_shortage <= 0); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. 
*/ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. 
*/
static long
vm_pageout_oom_pagecount(struct vmspace *vmspace)
{
	vm_map_t map;
	vm_map_entry_t entry;
	vm_object_t obj;
	long res;

	map = &vmspace->vm_map;
	KASSERT(!map->system_map, ("system map"));
	/* The caller must already hold the map lock. */
	sx_assert(&map->lock, SA_LOCKED);
	res = 0;
	VM_MAP_ENTRY_FOREACH(entry, map) {
		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
			continue;
		obj = entry->object.vm_object;
		if (obj == NULL)
			continue;
		/* Skip copy-on-write entries whose object is shared. */
		if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
		    obj->ref_count != 1)
			continue;
		/*
		 * NOTE(review): the leading '-' and '+' on the next three
		 * lines are unified-diff markers carried in this text, not C
		 * operators; the '+' line is the post-change condition.
		 */
-		if (obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP ||
-		    obj->type == OBJT_PHYS || obj->type == OBJT_VNODE ||
+		if (obj->type == OBJT_PHYS || obj->type == OBJT_VNODE ||
		    (obj->flags & OBJ_SWAP) != 0)
			res += obj->resident_page_count;
	}
	return (res);
}

/* Value of ticks at the last OOM request that passed the rate limiter. */
static int vm_oom_ratelim_last;
/* Minimum spacing, in seconds, enforced for VM_OOM_MEM_PF requests. */
static int vm_oom_pf_secs = 10;
SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0,
    "");
/* Protects vm_oom_ratelim_last. */
static struct mtx vm_oom_ratelim_mtx;

/*
 * Pick the largest eligible process and kill it.
 *
 * shortage identifies why OOM handling was requested: VM_OOM_MEM,
 * VM_OOM_MEM_PF or VM_OOM_SWAPZ.  A VM_OOM_MEM_PF request is ignored if
 * fewer than vm_oom_pf_secs seconds have passed since the last request that
 * got through.  A process's size is its swap usage plus, for the two
 * memory-shortage reasons, the estimate from vm_pageout_oom_pagecount().
 * Any other value of shortage panics once a victim has been found.
 */
void
vm_pageout_oom(int shortage)
{
	const char *reason;
	struct proc *p, *bigproc;
	vm_offset_t size, bigsize;
	struct thread *td;
	struct vmspace *vm;
	int now;
	bool breakout;

	/*
	 * For OOM requests originating from vm_fault(), there is a high
	 * chance that a single large process faults simultaneously in
	 * several threads.  Also, on an active system running many
	 * processes of middle-size, like buildworld, all of them
	 * could fault almost simultaneously as well.
	 *
	 * To avoid killing too many processes, rate-limit OOMs
	 * initiated by vm_fault() time-outs on the waits for free
	 * pages.
	 */
	mtx_lock(&vm_oom_ratelim_mtx);
	now = ticks;
	if (shortage == VM_OOM_MEM_PF &&
	    (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) {
		mtx_unlock(&vm_oom_ratelim_mtx);
		return;
	}
	vm_oom_ratelim_last = now;
	mtx_unlock(&vm_oom_ratelim_mtx);

	/*
	 * We keep the process bigproc locked once we find it to keep anyone
	 * from messing with it; however, there is a possibility of
	 * deadlock if process B is bigproc and one of its child processes
	 * attempts to propagate a signal to B while we are waiting for A's
	 * lock while walking this list.  To avoid this, we don't block on
	 * the process lock but just skip a process if it is already locked.
	 *
	 * NOTE(review): in the code below the candidate is kept across
	 * iterations with _PHOLD_LITE()/PRELE() and its process lock is
	 * dropped; the text above may be out of date -- confirm.
	 */
	bigproc = NULL;
	bigsize = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);

		/*
		 * If this is a system, protected or killed process, skip it.
		 */
		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
		    p->p_pid == 1 || P_KILLED(p) ||
		    (p->p_pid < 48 && swap_pager_avail != 0)) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * If the process is in a non-running type state,
		 * don't touch it.  Check all the threads individually.
		 */
		breakout = false;
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (!TD_ON_RUNQ(td) &&
			    !TD_IS_RUNNING(td) &&
			    !TD_IS_SLEEPING(td) &&
			    !TD_IS_SUSPENDED(td) &&
			    !TD_IS_SWAPPED(td)) {
				thread_unlock(td);
				breakout = true;
				break;
			}
			thread_unlock(td);
		}
		if (breakout) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * get the process size
		 */
		vm = vmspace_acquire_ref(p);
		if (vm == NULL) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * Hold the process so that both its lock and allproc_lock
		 * can be dropped while the map is examined.
		 */
		_PHOLD_LITE(p);
		PROC_UNLOCK(p);
		sx_sunlock(&allproc_lock);
		/* Never block on the map lock; skip the process instead. */
		if (!vm_map_trylock_read(&vm->vm_map)) {
			vmspace_free(vm);
			sx_slock(&allproc_lock);
			PRELE(p);
			continue;
		}
		size = vmspace_swap_count(vm);
		if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF)
			size += vm_pageout_oom_pagecount(vm);
		vm_map_unlock_read(&vm->vm_map);
		vmspace_free(vm);
		sx_slock(&allproc_lock);

		/*
		 * If this process is bigger than the biggest one,
		 * remember it.
		 */
		if (size > bigsize) {
			/* Release the hold on the previous candidate. */
			if (bigproc != NULL)
				PRELE(bigproc);
			bigproc = p;
			bigsize = size;
		} else {
			PRELE(p);
		}
	}
	sx_sunlock(&allproc_lock);

	if (bigproc != NULL) {
		switch (shortage) {
		case VM_OOM_MEM:
			reason = "failed to reclaim memory";
			break;
		case VM_OOM_MEM_PF:
			reason = "a thread waited too long to allocate a page";
			break;
		case VM_OOM_SWAPZ:
			reason = "out of swap space";
			break;
		default:
			panic("unknown OOM reason %d", shortage);
		}
		/*
		 * A non-zero vm_panic_on_oom is decremented on every OOM
		 * event; the system panics when it reaches zero.
		 */
		if (vm_panic_on_oom != 0 && --vm_panic_on_oom == 0)
			panic("%s", reason);
		PROC_LOCK(bigproc);
		killproc(bigproc, reason);
		sched_nice(bigproc, PRIO_MIN);
		/* Drop the hold taken while scanning. */
		_PRELE(bigproc);
		PROC_UNLOCK(bigproc);
	}
}

/*
 * Signal a free page shortage to subsystems that have registered an event
 * handler.  Reclaim memory from UMA in the event of a severe shortage.
 * Return true if the free page count should be re-evaluated.
 */
static bool
vm_pageout_lowmem(void)
{
	static int lowmem_ticks = 0;
	int last;
	bool ret;

	ret = false;

	last = atomic_load_int(&lowmem_ticks);
	/*
	 * Run the handlers at most once per lowmem_period seconds.  The
	 * thread whose compare-and-set of lowmem_ticks succeeds does the
	 * work; a thread that loses the race re-evaluates the period test.
	 */
	while ((u_int)(ticks - last) / hz >= lowmem_period) {
		if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0)
			continue;

		/*
		 * Decrease registered cache sizes.
		 */
		SDT_PROBE0(vm, , , vm__lowmem_scan);
		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);

		/*
		 * We do this explicitly after the caches have been
		 * drained above.
		 */
		uma_reclaim(UMA_RECLAIM_TRIM);
		ret = true;
		break;
	}

	/*
	 * Kick off an asynchronous reclaim of cached memory if one of the
	 * page daemons is failing to keep up with demand.  Use the "severe"
	 * threshold instead of "min" to ensure that we do not blow away the
	 * caches if a subset of the NUMA domains are depleted by kernel memory
	 * allocations; the domainset iterators automatically skip domains
	 * below the "min" threshold on the first pass.
	 *
	 * UMA reclaim worker has its own rate-limiting mechanism, so don't
	 * worry about kicking it too often.
	 */
	if (vm_page_count_severe())
		uma_reclaim_wakeup();

	return (ret);
}

/*
 * Main loop of the page daemon for one domain; arg carries the domain
 * index.  Each iteration waits until paging is needed or the scan interval
 * expires, asks the PID controller for a free-page shortage, runs the
 * inactive scan if that shortage is positive and then always runs the
 * active queue scan.  Never returns.
 */
static void
vm_pageout_worker(void *arg)
{
	struct vm_domain *vmd;
	u_int ofree;
	int addl_shortage, domain, shortage;
	bool target_met;

	domain = (uintptr_t)arg;
	vmd = VM_DOMAIN(domain);
	shortage = 0;
	target_met = true;

	/*
	 * XXXKIB It could be useful to bind pageout daemon threads to
	 * the cores belonging to the domain, from which vm_page_array
	 * is allocated.
	 */

	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
	vmd->vmd_last_active_scan = ticks;

	/*
	 * The pageout daemon worker is never done, so loop forever.
	 */
	while (TRUE) {
		vm_domain_pageout_lock(vmd);

		/*
		 * We need to clear wanted before we check the limits.  This
		 * prevents races with wakers who will check wanted after they
		 * reach the limit.
		 */
		atomic_store_int(&vmd->vmd_pageout_wanted, 0);

		/*
		 * Might the page daemon need to run again?
		 */
		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
			/*
			 * Yes.  If the scan failed to produce enough free
			 * pages, sleep uninterruptibly for some time in the
			 * hope that the laundry thread will clean some pages.
			 */
			vm_domain_pageout_unlock(vmd);
			if (!target_met)
				pause("pwait", hz / VM_INACT_SCAN_RATE);
		} else {
			/*
			 * No, sleep until the next wakeup or until pages
			 * need to have their reference stats updated.
			 */
			if (mtx_sleep(&vmd->vmd_pageout_wanted,
			    vm_domain_pageout_lockptr(vmd), PDROP | PVM,
			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
				VM_CNT_INC(v_pdwakeups);
		}

		/* Prevent spurious wakeups by ensuring that wanted is set. */
		atomic_store_int(&vmd->vmd_pageout_wanted, 1);

		/*
		 * Use the controller to calculate how many pages to free in
		 * this interval, and scan the inactive queue.  If the lowmem
		 * handlers appear to have freed up some pages, subtract the
		 * difference from the inactive queue scan target.
		 */
		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
		if (shortage > 0) {
			ofree = vmd->vmd_free_count;
			/* The min() keeps shortage from going negative. */
			if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
				shortage -= min(vmd->vmd_free_count - ofree,
				    (u_int)shortage);
			target_met = vm_pageout_inactive(vmd, shortage,
			    &addl_shortage);
		} else
			addl_shortage = 0;

		/*
		 * Scan the active queue.  A positive value for shortage
		 * indicates that we must aggressively deactivate pages to avoid
		 * a shortfall.
		 */
		shortage = vm_pageout_active_target(vmd) + addl_shortage;
		vm_pageout_scan_active(vmd, shortage);
	}
}

/*
 * vm_pageout_helper runs additional pageout daemons in times of high paging
 * activity.
 */
static void
vm_pageout_helper(void *arg)
{
	struct vm_domain *vmd;
	int domain;

	domain = (uintptr_t)arg;
	vmd = VM_DOMAIN(domain);

	vm_domain_pageout_lock(vmd);
	for (;;) {
		/* Sleep until vm_pageout_inactive_dispatch() issues a wakeup. */
		msleep(&vmd->vmd_inactive_shortage,
		    vm_domain_pageout_lockptr(vmd), PVM, "psleep", 0);
		/*
		 * Drop one of the threads - 1 references the dispatcher
		 * acquired on vmd_inactive_starting.
		 */
		blockcount_release(&vmd->vmd_inactive_starting, 1);

		/* Scan this helper's share with the pageout lock dropped. */
		vm_domain_pageout_unlock(vmd);
		vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage);
		vm_domain_pageout_lock(vmd);

		/*
		 * Release the running count while the pageout lock is held to
		 * prevent wakeup races.
		 */
		blockcount_release(&vmd->vmd_inactive_running, 1);
	}
}

/*
 * Compute how many inactive-queue scan threads the given domain should
 * have.  Returns 0 for an empty domain.  As a side effect, the global
 * pageout_cpus_per_thread is raised to at least 2 or, failing that, capped
 * at mp_ncpus.
 */
static int
get_pageout_threads_per_domain(const struct vm_domain *vmd)
{
	unsigned total_pageout_threads, eligible_cpus, domain_cpus;

	if (VM_DOMAIN_EMPTY(vmd->vmd_domain))
		return (0);

	/*
	 * Semi-arbitrarily constrain pagedaemon threads to less than half the
	 * total number of CPUs in the system as an upper limit.
	 */
	if (pageout_cpus_per_thread < 2)
		pageout_cpus_per_thread = 2;
	else if (pageout_cpus_per_thread > mp_ncpus)
		pageout_cpus_per_thread = mp_ncpus;

	total_pageout_threads = howmany(mp_ncpus, pageout_cpus_per_thread);
	domain_cpus = CPU_COUNT(&cpuset_domain[vmd->vmd_domain]);

	/* Pagedaemons are not run in empty domains. */
	eligible_cpus = mp_ncpus;
	for (unsigned i = 0; i < vm_ndomains; i++)
		if (VM_DOMAIN_EMPTY(i))
			eligible_cpus -= CPU_COUNT(&cpuset_domain[i]);

	/*
	 * Assign a portion of the total pageout threads to this domain
	 * corresponding to the fraction of pagedaemon-eligible CPUs in the
	 * domain.  In asymmetric NUMA systems, domains with more CPUs may be
	 * allocated more threads than domains with fewer CPUs.
	 */
	return (howmany(total_pageout_threads * domain_cpus, eligible_cpus));
}

/*
 * Initialize basic pageout daemon settings.  See the comment above the
 * definition of vm_domain for some explanation of how these thresholds are
 * used.
 */
static void
vm_pageout_init_domain(int domain)
{
	struct vm_domain *vmd;
	struct sysctl_oid *oid;

	vmd = VM_DOMAIN(domain);
	vmd->vmd_interrupt_free_min = 2;

	/*
	 * v_free_reserved needs to include enough for the largest
	 * swap pager structures plus enough for any pv_entry structs
	 * when paging.
	 */
	vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE +
	    vmd->vmd_interrupt_free_min;
	vmd->vmd_free_reserved = vm_pageout_page_count +
	    vmd->vmd_pageout_free_min + vmd->vmd_page_count / 768;
	/*
	 * free_min and free_severe start out as fractions of the domain's
	 * page count; the reserve is added to both after free_target has
	 * been derived from the unadjusted free_min.
	 */
	vmd->vmd_free_min = vmd->vmd_page_count / 200;
	vmd->vmd_free_severe = vmd->vmd_free_min / 2;
	vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
	vmd->vmd_free_min += vmd->vmd_free_reserved;
	vmd->vmd_free_severe += vmd->vmd_free_reserved;
	/* 1.5 times the free target, capped at a third of the free pages. */
	vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
	if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;

	/*
	 * Set the default wakeup threshold to be 10% below the paging
	 * target.  This keeps the steady state out of shortfall.
	 */
	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;

	/*
	 * Target amount of memory to move out of the laundry queue during a
	 * background laundering.  This is proportional to the amount of system
	 * memory.
	 */
	vmd->vmd_background_launder_target = (vmd->vmd_free_target -
	    vmd->vmd_free_min) / 10;

	/* Initialize the pageout daemon pid controller. */
	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
	    vmd->vmd_free_target, PIDCTRL_BOUND,
	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
	    "pidctrl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));

	vmd->vmd_inactive_threads = get_pageout_threads_per_domain(vmd);
}

/*
 * Initialize the paging thresholds of every domain and accumulate them into
 * the global vm_cnt totals.  Also supplies defaults for
 * vm_pageout_update_period and vm_page_max_user_wired when they are zero.
 */
static void
vm_pageout_init(void)
{
	u_long freecount;
	int i;

	/*
	 * Initialize some paging parameters.
	 */
	if (vm_cnt.v_page_count < 2000)
		vm_pageout_page_count = 8;

	freecount = 0;
	for (i = 0; i < vm_ndomains; i++) {
		struct vm_domain *vmd;

		vm_pageout_init_domain(i);
		vmd = VM_DOMAIN(i);
		vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
		vm_cnt.v_free_target += vmd->vmd_free_target;
		vm_cnt.v_free_min += vmd->vmd_free_min;
		vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
		vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
		vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
		vm_cnt.v_free_severe += vmd->vmd_free_severe;
		freecount += vmd->vmd_free_count;
	}

	/*
	 * Set interval in seconds for active scan.  We want to visit each
	 * page at least once every ten minutes.  This is to prevent worst
	 * case paging behaviors with stale active LRU.
	 */
	if (vm_pageout_update_period == 0)
		vm_pageout_update_period = 600;

	/*
	 * Set the maximum number of user-wired virtual pages.  Historically the
	 * main source of such pages was mlock(2) and mlockall(2).  Hypervisors
	 * may also request user-wired memory.
	 */
	if (vm_page_max_user_wired == 0)
		vm_page_max_user_wired = 4 * freecount / 5;
}

/*
 *     vm_pageout is the high level pageout daemon.
*/
static void
vm_pageout(void)
{
	struct proc *p;
	struct thread *td;
	int error, first, i, j, pageout_threads;

	p = curproc;
	td = curthread;

	mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF);
	swap_pager_swap_init();
	for (first = -1, i = 0; i < vm_ndomains; i++) {
		if (VM_DOMAIN_EMPTY(i)) {
			if (bootverbose)
				printf("domain %d empty; skipping pageout\n",
				    i);
			continue;
		}
		/*
		 * The first non-empty domain is served by the calling thread
		 * (see the end of this function); every other non-empty
		 * domain gets its own worker thread.
		 */
		if (first == -1)
			first = i;
		else {
			error = kthread_add(vm_pageout_worker,
			    (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
			if (error != 0)
				panic("starting pageout for domain %d: %d\n",
				    i, error);
		}
		/*
		 * The domain's worker counts as one of its scan threads, so
		 * only vmd_inactive_threads - 1 helpers are created.
		 */
		pageout_threads = VM_DOMAIN(i)->vmd_inactive_threads;
		for (j = 0; j < pageout_threads - 1; j++) {
			error = kthread_add(vm_pageout_helper,
			    (void *)(uintptr_t)i, p, NULL, 0, 0,
			    "dom%d helper%d", i, j);
			if (error != 0)
				panic("starting pageout helper %d for domain "
				    "%d: %d\n", j, i, error);
		}
		/* Each non-empty domain also gets a laundry thread. */
		error = kthread_add(vm_pageout_laundry_worker,
		    (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
		if (error != 0)
			panic("starting laundry for domain %d: %d", i, error);
	}
	error = kthread_add(uma_reclaim_worker, NULL, p, NULL,
	    0, 0, "uma");
	if (error != 0)
		panic("starting uma_reclaim helper, error %d\n", error);

	/*
	 * Rename the current thread after the domain it serves and run that
	 * domain's worker directly in this thread.
	 */
	snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
	vm_pageout_worker((void *)(uintptr_t)first);
}

/*
 * Perform an advisory wakeup of the page daemon.
 */
void
pagedaemon_wakeup(int domain)
{
	struct vm_domain *vmd;

	vmd = VM_DOMAIN(domain);
	vm_domain_pageout_assert_unlocked(vmd);
	/* Threads of the page daemon process never wake the daemon. */
	if (curproc == pageproc)
		return;

	/*
	 * Only the caller that observes vmd_pageout_wanted at zero takes the
	 * pageout lock and issues the wakeup.  The store under the lock
	 * resets the value to exactly 1.
	 */
	if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) {
		vm_domain_pageout_lock(vmd);
		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
		wakeup(&vmd->vmd_pageout_wanted);
		vm_domain_pageout_unlock(vmd);
	}
}