Index: stable/12/sys/vm/vm_extern.h =================================================================== --- stable/12/sys/vm/vm_extern.h (revision 365006) +++ stable/12/sys/vm/vm_extern.h (revision 365007) @@ -1,134 +1,134 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 * $FreeBSD$ */ #ifndef _VM_EXTERN_H_ #define _VM_EXTERN_H_ struct pmap; struct proc; struct vmspace; struct vnode; struct vmem; #ifdef _KERNEL struct cdev; struct cdevsw; struct domainset; /* These operate on kernel virtual addresses only. */ vm_offset_t kva_alloc(vm_size_t); void kva_free(vm_offset_t, vm_size_t); /* These operate on pageable virtual addresses. */ vm_offset_t kmap_alloc_wait(vm_map_t, vm_size_t); void kmap_free_wakeup(vm_map_t, vm_offset_t, vm_size_t); /* These operate on virtual addresses backed by memory. */ vm_offset_t kmem_alloc_attr(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr); vm_offset_t kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr); vm_offset_t kmem_alloc_contig(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_offset_t kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr); vm_offset_t kmem_malloc(vm_size_t size, int flags); vm_offset_t kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags); void kmem_free(vm_offset_t addr, vm_size_t size); /* This provides memory for previously allocated address space. */ int kmem_back(vm_object_t, vm_offset_t, vm_size_t, int); int kmem_back_domain(int, vm_object_t, vm_offset_t, vm_size_t, int); void kmem_unback(vm_object_t, vm_offset_t, vm_size_t); /* Bootstrapping. */ void kmem_bootstrap_free(vm_offset_t, vm_size_t); -void kmem_subinit(vm_map_t, vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t, - bool); +vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t, + boolean_t); void kmem_init(vm_offset_t, vm_offset_t); void kmem_init_zero_region(void); void kmeminit(void); int kernacc(void *, int, int); int useracc(void *, int, int); int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t, vm_ooffset_t *); int vm_fault_disable_pagefaults(void); void vm_fault_enable_pagefaults(int save); int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count); int vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, int *signo, int *ucode); int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int); void vm_waitproc(struct proc *); int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t); int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *); int vm_mmap_to_errno(int rv); int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *); int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); void vm_set_page_size(void); void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t); typedef int (*pmap_pinit_t)(struct pmap *pmap); struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t); struct vmspace *vmspace_fork(struct vmspace *, vm_ooffset_t *); int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t); int vmspace_unshare(struct proc *); void vmspace_exit(struct thread *); struct vmspace *vmspace_acquire_ref(struct proc *); void vmspace_free(struct vmspace *); void vmspace_exitfree(struct proc *); void vmspace_switch_aio(struct vmspace *); void vnode_pager_setsize(struct vnode *, vm_ooffset_t); int vslock(void *, size_t); void vsunlock(void *, size_t); struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset); void vm_imgact_unmap_page(struct sf_buf *sf); void vm_thread_dispose(struct thread *td); int vm_thread_new(struct thread *td, int pages); u_int vm_active_count(void); u_int vm_inactive_count(void); u_int vm_laundry_count(void); u_int vm_wait_count(void); #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ Index: stable/12/sys/vm/vm_init.c =================================================================== --- stable/12/sys/vm/vm_init.c (revision 365006) +++ stable/12/sys/vm/vm_init.c (revision 365007) @@ -1,280 +1,280 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_init.c 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Initialize the Virtual Memory subsystem. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void uma_startup1(void); extern void uma_startup2(void); extern void vm_radix_reserve_kva(void); long physmem; /* * System initialization */ static void vm_mem_init(void *); SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL); /* * vm_init initializes the virtual memory system. * This is done only by the first cpu up. * * The start and end address of physical memory is passed in. */ static void vm_mem_init(void *dummy) { /* * Initialize static domainsets, used by various allocators. */ domainset_init(); /* * Initialize resident memory structures. From here on, all physical * memory is accounted for, and we use only virtual addresses. */ vm_set_page_size(); virtual_avail = vm_page_startup(virtual_avail); /* * Set an initial domain policy for thread0 so that allocations * can work. */ domainset_zero(); #ifdef UMA_MD_SMALL_ALLOC /* Announce page availability to UMA. */ uma_startup1(); #endif /* * Initialize other VM packages */ vmem_startup(); vm_object_init(); vm_map_startup(); kmem_init(virtual_avail, virtual_end); #ifndef UMA_MD_SMALL_ALLOC /* Set up radix zone to use noobj_alloc. */ vm_radix_reserve_kva(); #endif /* Announce full page availability to UMA. */ uma_startup2(); kmem_init_zero_region(); pmap_init(); vm_pager_init(); } void vm_ksubmap_init(struct kva_md_info *kmi) { vm_offset_t firstaddr; caddr_t v; vm_size_t size = 0; long physmem_est; vm_offset_t minaddr; vm_offset_t maxaddr; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; /* * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. */ physmem_est = lmin(physmem, btoc(vm_map_max(kernel_map) - vm_map_min(kernel_map))); v = kern_vfs_bio_buffer_alloc(v, physmem_est); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)v; #ifdef VM_FREELIST_DMA32 /* * Try to protect 32-bit DMAable memory from the largest * early alloc of wired mem. */ firstaddr = kmem_alloc_attr(size, M_ZERO | M_NOWAIT, (vm_paddr_t)1 << 32, ~(vm_paddr_t)0, VM_MEMATTR_DEFAULT); if (firstaddr == 0) #endif firstaddr = kmem_malloc(size, M_ZERO | M_WAITOK); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)((char *)v - firstaddr) != size) panic("startup: table size inconsistency"); /* * Allocate the clean map to hold all of the paging and I/O virtual * memory. */ size = (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS + (long)bio_transient_maxcnt * MAXPHYS; kmi->clean_sva = firstaddr = kva_alloc(size); kmi->clean_eva = firstaddr + size; /* * Allocate the buffer arena. * * Enable the quantum cache if we have more than 4 cpus. This * avoids lock contention at the expense of some fragmentation. */ size = (long)nbuf * BKVASIZE; kmi->buffer_sva = firstaddr; kmi->buffer_eva = kmi->buffer_sva + size; vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size, PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0); firstaddr += size; /* * Now swap kva. */ swapbkva = firstaddr; size = (long)nswbuf * MAXPHYS; firstaddr += size; /* * And optionally transient bio space. */ if (bio_transient_maxcnt != 0) { size = (long)bio_transient_maxcnt * MAXPHYS; vmem_init(transient_arena, "transient arena", firstaddr, size, PAGE_SIZE, 0, 0); firstaddr += size; } if (firstaddr != kmi->clean_eva) panic("Clean map calculation incorrect"); /* * Allocate the pageable submaps. We may cache an exec map entry per * CPU, so we therefore need to reserve space for at least ncpu+1 * entries to avoid deadlock. The exec map is also used by some image * activators, so we leave a fixed number of pages for their use. */ #ifdef __LP64__ exec_map_entries = 8 * mp_ncpus; #else exec_map_entries = 2 * mp_ncpus + 4; #endif exec_map_entry_size = round_page(PATH_MAX + ARG_MAX); - kmem_subinit(exec_map, kernel_map, &minaddr, &maxaddr, - exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, false); - kmem_subinit(pipe_map, kernel_map, &minaddr, &maxaddr, maxpipekva, - false); + exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, + exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, FALSE); + pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva, + FALSE); } Index: stable/12/sys/vm/vm_kern.c =================================================================== --- stable/12/sys/vm/vm_kern.c (revision 365006) +++ stable/12/sys/vm/vm_kern.c (revision 365007) @@ -1,855 +1,861 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Kernel memory management. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include /* for ticks and hz */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -struct vm_map kernel_map_store; -struct vm_map exec_map_store; -struct vm_map pipe_map_store; +vm_map_t kernel_map; +vm_map_t exec_map; +vm_map_t pipe_map; const void *zero_region; CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0); /* NB: Used by kernel debuggers. */ const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS; u_int exec_map_entry_size; u_int exec_map_entries; SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD, #if defined(__arm__) || defined(__sparc64__) &vm_max_kernel_address, 0, #else SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS, #endif "Max kernel address"); #if VM_NRESERVLEVEL > 0 #define KVA_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) #else /* On non-superpage architectures we want large import sizes. */ #define KVA_QUANTUM_SHIFT (8 + PAGE_SHIFT) #endif #define KVA_QUANTUM (1 << KVA_QUANTUM_SHIFT) /* * kva_alloc: * * Allocate a virtual address range with no underlying object and * no initial mapping to physical memory. Any mapping from this * range to physical memory must be explicitly created prior to * its use, typically with pmap_qenter(). Any attempt to create * a mapping on demand through vm_fault() will result in a panic. */ vm_offset_t kva_alloc(vm_size_t size) { vm_offset_t addr; size = round_page(size); if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr)) return (0); return (addr); } /* * kva_free: * * Release a region of kernel virtual memory allocated * with kva_alloc, and return the physical pages * associated with that region. * * This routine may not block on kernel maps. */ void kva_free(vm_offset_t addr, vm_size_t size) { size = round_page(size); vmem_free(kernel_arena, addr, size); } static vm_page_t kmem_alloc_contig_pages(vm_object_t object, vm_pindex_t pindex, int domain, int pflags, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vm_page_t m; int tries; bool wait; VM_OBJECT_ASSERT_WLOCKED(object); wait = (pflags & VM_ALLOC_WAITOK) != 0; pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); pflags |= VM_ALLOC_NOWAIT; for (tries = wait ? 3 : 1;; tries--) { m = vm_page_alloc_contig_domain(object, pindex, domain, pflags, npages, low, high, alignment, boundary, memattr); if (m != NULL || tries == 0) break; VM_OBJECT_WUNLOCK(object); if (!vm_page_reclaim_contig_domain(domain, pflags, npages, low, high, alignment, boundary) && wait) vm_wait_domain(domain); VM_OBJECT_WLOCK(object); } return (m); } /* * Allocates a region from the kernel address map and physical pages * within the specified address range to the kernel object. Creates a * wired mapping from this region to these pages, and returns the * region's starting virtual address. The allocated pages are not * necessarily physically contiguous. If M_ZERO is specified through the * given flags, then the pages are zeroed before they are mapped. */ static vm_offset_t kmem_alloc_attr_domain(int domain, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { vmem_t *vmem; vm_object_t object; vm_offset_t addr, i, offset; vm_page_t m; int pflags; vm_prot_t prot; object = kernel_object; size = round_page(size); vmem = vm_dom[domain].vmd_kernel_arena; if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW; VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += PAGE_SIZE) { m = kmem_alloc_contig_pages(object, atop(offset + i), domain, pflags, 1, low, high, PAGE_SIZE, 0, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); kmem_unback(object, addr, i); vmem_free(vmem, addr, size); return (0); } KASSERT(vm_phys_domain(m) == domain, ("kmem_alloc_attr_domain: Domain mismatch %d != %d", vm_phys_domain(m), domain)); if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, addr + i, m, prot, prot | PMAP_ENTER_WIRED, 0); } VM_OBJECT_WUNLOCK(object); return (addr); } vm_offset_t kmem_alloc_attr(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { return (kmem_alloc_attr_domainset(DOMAINSET_RR(), size, flags, low, high, memattr)); } vm_offset_t kmem_alloc_attr_domainset(struct domainset *ds, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_offset_t addr; int domain; vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { addr = kmem_alloc_attr_domain(domain, size, flags, low, high, memattr); if (addr != 0) break; } while (vm_domainset_iter_policy(&di, &domain) == 0); return (addr); } /* * Allocates a region from the kernel address map and physically * contiguous pages within the specified address range to the kernel * object. Creates a wired mapping from this region to these pages, and * returns the region's starting virtual address. If M_ZERO is specified * through the given flags, then the pages are zeroed before they are * mapped. */ static vm_offset_t kmem_alloc_contig_domain(int domain, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { vmem_t *vmem; vm_object_t object; vm_offset_t addr, offset, tmp; vm_page_t end_m, m; u_long npages; int pflags; object = kernel_object; size = round_page(size); vmem = vm_dom[domain].vmd_kernel_arena; if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr)) return (0); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; npages = atop(size); VM_OBJECT_WLOCK(object); m = kmem_alloc_contig_pages(object, atop(offset), domain, pflags, npages, low, high, alignment, boundary, memattr); if (m == NULL) { VM_OBJECT_WUNLOCK(object); vmem_free(vmem, addr, size); return (0); } KASSERT(vm_phys_domain(m) == domain, ("kmem_alloc_contig_domain: Domain mismatch %d != %d", vm_phys_domain(m), domain)); end_m = m + npages; tmp = addr; for (; m < end_m; m++) { if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW, VM_PROT_RW | PMAP_ENTER_WIRED, 0); tmp += PAGE_SIZE; } VM_OBJECT_WUNLOCK(object); return (addr); } vm_offset_t kmem_alloc_contig(vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { return (kmem_alloc_contig_domainset(DOMAINSET_RR(), size, flags, low, high, alignment, boundary, memattr)); } vm_offset_t kmem_alloc_contig_domainset(struct domainset *ds, vm_size_t size, int flags, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_offset_t addr; int domain; vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { addr = kmem_alloc_contig_domain(domain, size, flags, low, high, alignment, boundary, memattr); if (addr != 0) break; } while (vm_domainset_iter_policy(&di, &domain) == 0); return (addr); } /* - * kmem_subinit: + * kmem_suballoc: * - * Initializes a map to manage a subrange + * Allocates a map to manage a subrange * of the kernel virtual address space. * * Arguments are as follows: * * parent Map to take range from * min, max Returned endpoints of map * size Size of range to find * superpage_align Request that min is superpage aligned */ -void -kmem_subinit(vm_map_t map, vm_map_t parent, vm_offset_t *min, vm_offset_t *max, - vm_size_t size, bool superpage_align) +vm_map_t +kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max, + vm_size_t size, boolean_t superpage_align) { int ret; + vm_map_t result; size = round_page(size); *min = vm_map_min(parent); ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ? VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_NO_CHARGE); if (ret != KERN_SUCCESS) - panic("kmem_subinit: bad status return of %d", ret); + panic("kmem_suballoc: bad status return of %d", ret); *max = *min + size; - vm_map_init(map, vm_map_pmap(parent), *min, *max); - if (vm_map_submap(parent, *min, *max, map) != KERN_SUCCESS) - panic("kmem_subinit: unable to change range to submap"); + result = vm_map_create(vm_map_pmap(parent), *min, *max); + if (result == NULL) + panic("kmem_suballoc: cannot create submap"); + if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS) + panic("kmem_suballoc: unable to change range to submap"); + return (result); } /* * kmem_malloc_domain: * * Allocate wired-down pages in the kernel's address space. */ static vm_offset_t kmem_malloc_domain(int domain, vm_size_t size, int flags) { vmem_t *arena; vm_offset_t addr; int rv; if (__predict_true((flags & M_EXEC) == 0)) arena = vm_dom[domain].vmd_kernel_arena; else arena = vm_dom[domain].vmd_kernel_rwx_arena; size = round_page(size); if (vmem_alloc(arena, size, flags | M_BESTFIT, &addr)) return (0); rv = kmem_back_domain(domain, kernel_object, addr, size, flags); if (rv != KERN_SUCCESS) { vmem_free(arena, addr, size); return (0); } return (addr); } vm_offset_t kmem_malloc(vm_size_t size, int flags) { return (kmem_malloc_domainset(DOMAINSET_RR(), size, flags)); } vm_offset_t kmem_malloc_domainset(struct domainset *ds, vm_size_t size, int flags) { struct vm_domainset_iter di; vm_offset_t addr; int domain; vm_domainset_iter_policy_init(&di, ds, &domain, &flags); do { addr = kmem_malloc_domain(domain, size, flags); if (addr != 0) break; } while (vm_domainset_iter_policy(&di, &domain) == 0); return (addr); } /* * kmem_back_domain: * * Allocate physical pages from the specified domain for the specified * virtual address range. */ int kmem_back_domain(int domain, vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { vm_offset_t offset, i; vm_page_t m, mpred; vm_prot_t prot; int pflags; KASSERT(object == kernel_object, ("kmem_back_domain: only supports kernel object.")); offset = addr - VM_MIN_KERNEL_ADDRESS; pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED; pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); if (flags & M_WAITOK) pflags |= VM_ALLOC_WAITFAIL; prot = (flags & M_EXEC) != 0 ? VM_PROT_ALL : VM_PROT_RW; i = 0; VM_OBJECT_WLOCK(object); retry: mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i)); for (; i < size; i += PAGE_SIZE, mpred = m) { m = vm_page_alloc_domain_after(object, atop(offset + i), domain, pflags, mpred); /* * Ran out of space, free everything up and return. Don't need * to lock page queues here as we know that the pages we got * aren't on any queues. */ if (m == NULL) { if ((flags & M_NOWAIT) == 0) goto retry; VM_OBJECT_WUNLOCK(object); kmem_unback(object, addr, i); return (KERN_NO_SPACE); } KASSERT(vm_phys_domain(m) == domain, ("kmem_back_domain: Domain mismatch %d != %d", vm_phys_domain(m), domain)); if (flags & M_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("kmem_malloc: page %p is managed", m)); m->valid = VM_PAGE_BITS_ALL; pmap_enter(kernel_pmap, addr + i, m, prot, prot | PMAP_ENTER_WIRED, 0); if (__predict_false((prot & VM_PROT_EXECUTE) != 0)) m->oflags |= VPO_KMEM_EXEC; } VM_OBJECT_WUNLOCK(object); return (KERN_SUCCESS); } /* * kmem_back: * * Allocate physical pages for the specified virtual address range. */ int kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags) { vm_offset_t end, next, start; int domain, rv; KASSERT(object == kernel_object, ("kmem_back: only supports kernel object.")); for (start = addr, end = addr + size; addr < end; addr = next) { /* * We must ensure that pages backing a given large virtual page * all come from the same physical domain. */ if (vm_ndomains > 1) { domain = (addr >> KVA_QUANTUM_SHIFT) % vm_ndomains; while (VM_DOMAIN_EMPTY(domain)) domain++; next = roundup2(addr + 1, KVA_QUANTUM); if (next > end || next < start) next = end; } else { domain = 0; next = end; } rv = kmem_back_domain(domain, object, addr, next - addr, flags); if (rv != KERN_SUCCESS) { kmem_unback(object, start, addr - start); break; } } return (rv); } /* * kmem_unback: * * Unmap and free the physical pages underlying the specified virtual * address range. * * A physical page must exist within the specified object at each index * that is being unmapped. */ static struct vmem * _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { struct vmem *arena; vm_page_t m, next; vm_offset_t end, offset; int domain; KASSERT(object == kernel_object, ("kmem_unback: only supports kernel object.")); if (size == 0) return (NULL); pmap_remove(kernel_pmap, addr, addr + size); offset = addr - VM_MIN_KERNEL_ADDRESS; end = offset + size; VM_OBJECT_WLOCK(object); m = vm_page_lookup(object, atop(offset)); domain = vm_phys_domain(m); if (__predict_true((m->oflags & VPO_KMEM_EXEC) == 0)) arena = vm_dom[domain].vmd_kernel_arena; else arena = vm_dom[domain].vmd_kernel_rwx_arena; for (; offset < end; offset += PAGE_SIZE, m = next) { next = vm_page_next(m); vm_page_unwire_noq(m); vm_page_free(m); } VM_OBJECT_WUNLOCK(object); return (arena); } void kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size) { (void)_kmem_unback(object, addr, size); } /* * kmem_free: * * Free memory allocated with kmem_malloc. The size must match the * original allocation. */ void kmem_free(vm_offset_t addr, vm_size_t size) { struct vmem *arena; size = round_page(size); arena = _kmem_unback(kernel_object, addr, size); if (arena != NULL) vmem_free(arena, addr, size); } /* * kmap_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * * This routine may block. */ vm_offset_t kmap_alloc_wait(vm_map_t map, vm_size_t size) { vm_offset_t addr; size = round_page(size); if (!swap_reserve(size)) return (0); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. */ vm_map_lock(map); addr = vm_map_findspace(map, vm_map_min(map), size); if (addr + size <= vm_map_max(map)) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); swap_release(size); return (0); } map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, 0); } vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW, MAP_ACC_CHARGED); vm_map_unlock(map); return (addr); } /* * kmap_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size) { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); if (map->needs_wakeup) { map->needs_wakeup = FALSE; vm_map_wakeup(map); } vm_map_unlock(map); } void kmem_init_zero_region(void) { vm_offset_t addr, i; vm_page_t m; /* * Map a single physical page of zeros to a larger virtual range. * This requires less looping in places that want large amounts of * zeros, while not using much more physical resources. */ addr = kva_alloc(ZERO_REGION_SIZE); m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE) pmap_qenter(addr + i, &m, 1); pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ); zero_region = (const void *)addr; } /* * Import KVA from the kernel map into the kernel arena. */ static int kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp) { vm_offset_t addr; int result; KASSERT((size % KVA_QUANTUM) == 0, ("kva_import: Size %jd is not a multiple of %d", (intmax_t)size, (int)KVA_QUANTUM)); addr = vm_map_min(kernel_map); result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0, VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); if (result != KERN_SUCCESS) return (ENOMEM); *addrp = addr; return (0); } /* * Import KVA from a parent arena into a per-domain arena. Imports must be * KVA_QUANTUM-aligned and a multiple of KVA_QUANTUM in size. */ static int kva_import_domain(void *arena, vmem_size_t size, int flags, vmem_addr_t *addrp) { KASSERT((size % KVA_QUANTUM) == 0, ("kva_import_domain: Size %jd is not a multiple of %d", (intmax_t)size, (int)KVA_QUANTUM)); return (vmem_xalloc(arena, size, KVA_QUANTUM, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp)); } /* * kmem_init: * * Create the kernel map; insert a mapping covering kernel text, * data, bss, and all space allocated thus far (`boostrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. * Create the kernel vmem arena and its per-domain children. */ void kmem_init(vm_offset_t start, vm_offset_t end) { + vm_map_t m; int domain; - vm_map_init(kernel_map, kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); - kernel_map->system_map = 1; - vm_map_lock(kernel_map); + m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); + m->system_map = 1; + vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ - (void) vm_map_insert(kernel_map, NULL, (vm_ooffset_t) 0, + kernel_map = m; + (void) vm_map_insert(m, NULL, (vm_ooffset_t) 0, #ifdef __amd64__ KERNBASE, #else VM_MIN_KERNEL_ADDRESS, #endif start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); /* ... and ending with the completion of the above `insert' */ - vm_map_unlock(kernel_map); + vm_map_unlock(m); /* * Initialize the kernel_arena. This can grow on demand. */ vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0); vmem_set_import(kernel_arena, kva_import, NULL, NULL, KVA_QUANTUM); for (domain = 0; domain < vm_ndomains; domain++) { /* * Initialize the per-domain arenas. These are used to color * the KVA space in a way that ensures that virtual large pages * are backed by memory from the same physical domain, * maximizing the potential for superpage promotion. */ vm_dom[domain].vmd_kernel_arena = vmem_create( "kernel arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); vmem_set_import(vm_dom[domain].vmd_kernel_arena, kva_import_domain, NULL, kernel_arena, KVA_QUANTUM); /* * In architectures with superpages, maintain separate arenas * for allocations with permissions that differ from the * "standard" read/write permissions used for kernel memory, * so as not to inhibit superpage promotion. */ #if VM_NRESERVLEVEL > 0 vm_dom[domain].vmd_kernel_rwx_arena = vmem_create( "kernel rwx arena domain", 0, 0, PAGE_SIZE, 0, M_WAITOK); vmem_set_import(vm_dom[domain].vmd_kernel_rwx_arena, kva_import_domain, (vmem_release_t *)vmem_xfree, kernel_arena, KVA_QUANTUM); #else vm_dom[domain].vmd_kernel_rwx_arena = vm_dom[domain].vmd_kernel_arena; #endif } } /* * kmem_bootstrap_free: * * Free pages backing preloaded data (e.g., kernel modules) to the * system. Currently only supported on platforms that create a * vm_phys segment for preloaded data. */ void kmem_bootstrap_free(vm_offset_t start, vm_size_t size) { #if defined(__i386__) || defined(__amd64__) struct vm_domain *vmd; vm_offset_t end, va; vm_paddr_t pa; vm_page_t m; end = trunc_page(start + size); start = round_page(start); for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_kextract(va); m = PHYS_TO_VM_PAGE(pa); vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, 1); vm_cnt.v_page_count++; } pmap_remove(kernel_pmap, start, end); (void)vmem_add(kernel_arena, start, end - start, M_WAITOK); #endif } /* * Allow userspace to directly trigger the VM drain routine for testing * purposes. */ static int debug_vm_lowmem(SYSCTL_HANDLER_ARGS) { int error, i; i = 0; error = sysctl_handle_int(oidp, &i, 0, req); if (error) return (error); if ((i & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0) return (EINVAL); if (i != 0) EVENTHANDLER_INVOKE(vm_lowmem, i); return (0); } SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0, debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags"); Index: stable/12/sys/vm/vm_kern.h =================================================================== --- stable/12/sys/vm/vm_kern.h (revision 365006) +++ stable/12/sys/vm/vm_kern.h (revision 365007) @@ -1,85 +1,82 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.h 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_VM_KERN_H_ #define _VM_VM_KERN_H_ /* Kernel memory management definitions. */ -extern struct vm_map kernel_map_store; -#define kernel_map (&kernel_map_store) -extern struct vm_map exec_map_store; -#define exec_map (&exec_map_store) -extern struct vm_map pipe_map_store; -#define pipe_map (&pipe_map_store) +extern vm_map_t kernel_map; +extern vm_map_t exec_map; +extern vm_map_t pipe_map; extern struct vmem *kernel_arena; extern struct vmem *kmem_arena; extern struct vmem *buffer_arena; extern struct vmem *transient_arena; extern struct vmem *memguard_arena; extern vm_offset_t swapbkva; extern u_long vm_kmem_size; extern u_int exec_map_entries; extern u_int exec_map_entry_size; #endif /* _VM_VM_KERN_H_ */ Index: stable/12/sys/vm/vm_map.c =================================================================== --- stable/12/sys/vm/vm_map.c (revision 365006) +++ stable/12/sys/vm/vm_map.c (revision 365007) @@ -1,4914 +1,4963 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a self-adjusting binary search tree of these * entries is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; +static uma_zone_t mapzone; static uma_zone_t vmspace_zone; static int vmspace_zinit(void *mem, int size, int flags); +static int vm_map_zinit(void *mem, int ize, int flags); static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max); static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry); static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags); #ifdef INVARIANTS +static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow); static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr); #define ENTRY_CHARGED(e) ((e)->cred != NULL || \ ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \ !((e)->eflags & MAP_ENTRY_NEEDS_COPY))) /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); + mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, +#ifdef INVARIANTS + vm_map_zdtor, +#else + NULL, +#endif + vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_prealloc(mapzone, MAX_KMAP); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } static int vmspace_zinit(void *mem, int size, int flags) { struct vmspace *vm; - vm_map_t map; vm = (struct vmspace *)mem; - map = &vm->vm_map; + vm->vm_map.pmap = NULL; + (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags); + PMAP_LOCK_INIT(vmspace_pmap(vm)); + return (0); +} + +static int +vm_map_zinit(void *mem, int size, int flags) +{ + vm_map_t map; + + map = (vm_map_t)mem; memset(map, 0, sizeof(*map)); - mtx_init(&map->system_mtx, "vm map (system)", NULL, - MTX_DEF | MTX_DUPOK); + mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "vm map (user)"); - PMAP_LOCK_INIT(vmspace_pmap(vm)); return (0); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; - KASSERT(vm->vm_map.nentries == 0, - ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries)); - KASSERT(vm->vm_map.size == 0, - ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size)); + + vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); } +static void +vm_map_zdtor(void *mem, int size, void *arg) +{ + vm_map_t map; + + map = (vm_map_t)mem; + KASSERT(map->nentries == 0, + ("map %p nentries == %d on free.", + map, map->nentries)); + KASSERT(map->size == 0, + ("map %p size == %lu on free.", + map, (unsigned long)map->size)); +} #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. + * + * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit(). */ struct vmspace * vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit) { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL")); if (!pinit(vmspace_pmap(vm))) { uma_zfree(vmspace_zone, vm); return (NULL); } CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max); vm->vm_refcnt = 1; vm->vm_shm = NULL; vm->vm_swrss = 0; vm->vm_tsize = 0; vm->vm_dsize = 0; vm->vm_ssize = 0; vm->vm_taddr = 0; vm->vm_daddr = 0; vm->vm_maxsaddr = 0; return (vm); } #ifdef RACCT static void vmspace_container_reset(struct proc *p) { PROC_LOCK(p); racct_set(p, RACCT_DATA, 0); racct_set(p, RACCT_STACK, 0); racct_set(p, RACCT_RSS, 0); racct_set(p, RACCT_MEMLOCK, 0); racct_set(p, RACCT_VMEM, 0); PROC_UNLOCK(p); } #endif static inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). */ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); pmap_release(vmspace_pmap(vm)); vm->vm_map.pmap = NULL; uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmspace_free() called"); if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; p->p_vmspace = NULL; PROC_VMSPACE_UNLOCK(p); KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } void vmspace_exit(struct thread *td) { int refcnt; struct vmspace *vm; struct proc *p; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * * The last exiting process to reach this point releases as * much of the environment as it can. vmspace_dofree() is the * slower fallback in case another process had a temporary * reference to the vmspace. */ p = td->td_proc; vm = p->p_vmspace; atomic_add_int(&vmspace0.vm_refcnt, 1); do { refcnt = vm->vm_refcnt; if (refcnt > 1 && p->p_vmspace != &vmspace0) { /* Switch now since other proc might free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); if (refcnt == 1) { if (p->p_vmspace != vm) { /* vmspace not yet freed, switch back */ PROC_VMSPACE_LOCK(p); p->p_vmspace = vm; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } pmap_remove_pages(vmspace_pmap(vm)); /* Switch now since this proc will free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); vmspace_dofree(vm); } #ifdef RACCT if (racct_enable) vmspace_container_reset(p); #endif } /* Acquire reference to vmspace owned by another process. */ struct vmspace * vmspace_acquire_ref(struct proc *p) { struct vmspace *vm; int refcnt; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; if (vm == NULL) { PROC_VMSPACE_UNLOCK(p); return (NULL); } do { refcnt = vm->vm_refcnt; if (refcnt <= 0) { /* Avoid 0->1 transition */ PROC_VMSPACE_UNLOCK(p); return (NULL); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1)); if (vm != p->p_vmspace) { PROC_VMSPACE_UNLOCK(p); vmspace_free(vm); return (NULL); } PROC_VMSPACE_UNLOCK(p); return (vm); } /* * Switch between vmspaces in an AIO kernel process. * * The new vmspace is either the vmspace of a user process obtained * from an active AIO request or the initial vmspace of the AIO kernel * process (when it is idling). Because user processes will block to * drain any active AIO requests before proceeding in exit() or * execve(), the reference count for vmspaces from AIO requests can * never be 0. Similarly, AIO kernel processes hold an extra * reference on their initial vmspace for the life of the process. As * a result, the 'newvm' vmspace always has a non-zero reference * count. This permits an additional reference on 'newvm' to be * acquired via a simple atomic increment rather than the loop in * vmspace_acquire_ref() above. */ void vmspace_switch_aio(struct vmspace *newvm) { struct vmspace *oldvm; /* XXX: Need some way to assert that this is an aio daemon. */ KASSERT(newvm->vm_refcnt > 0, ("vmspace_switch_aio: newvm unreferenced")); oldvm = curproc->p_vmspace; if (oldvm == newvm) return; /* * Point to the new address space and refer to it. */ curproc->p_vmspace = newvm; atomic_add_int(&newvm->vm_refcnt, 1); /* Activate the new mapping. */ pmap_activate(curthread); vmspace_free(oldvm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_xlock_(&map->lock, file, line); map->timestamp++; } void vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add) { vm_object_t object, object1; struct vnode *vp; if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0) return; KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with execs")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for text, entry %p", entry)); VM_OBJECT_RLOCK(object); while ((object1 = object->backing_object) != NULL) { VM_OBJECT_RLOCK(object1); VM_OBJECT_RUNLOCK(object); object = object1; } vp = NULL; if (object->type == OBJT_DEAD) { /* * For OBJT_DEAD objects, v_writecount was handled in * vnode_pager_dealloc(). */ } else if (object->type == OBJT_VNODE) { vp = object->handle; } else if (object->type == OBJT_SWAP) { KASSERT((object->flags & OBJ_TMPFS_NODE) != 0, ("vm_map_entry_set_vnode_text: swap and !TMPFS " "entry %p, object %p, add %d", entry, object, add)); /* * Tmpfs VREG node, which was reclaimed, has * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS. In * this case there is no v_writecount to adjust. */ if ((object->flags & OBJ_TMPFS) != 0) vp = object->un_pager.swp.swp_tmpfs; } else { KASSERT(0, ("vm_map_entry_set_vnode_text: wrong object type, " "entry %p, object %p, add %d", entry, object, add)); } if (vp != NULL) { if (add) { VOP_SET_TEXT_CHECKED(vp); VM_OBJECT_RUNLOCK(object); } else { vhold(vp); VM_OBJECT_RUNLOCK(object); vn_lock(vp, LK_SHARED | LK_RETRY); VOP_UNSET_TEXT_CHECKED(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } } else { VM_OBJECT_RUNLOCK(object); } } static void vm_map_process_deferred(void) { struct thread *td; vm_map_entry_t entry, next; vm_object_t object; td = curthread; entry = td->td_map_def_user; td->td_map_def_user = NULL; while (entry != NULL) { next = entry->next; MPASS((entry->eflags & (MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC)); if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) { /* * Decrement the object's writemappings and * possibly the vnode's v_writecount. */ KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap with writecount")); object = entry->object.vm_object; KASSERT(object != NULL, ("No object for writecount")); vm_pager_release_writecount(object, entry->start, entry->end); } vm_map_entry_set_vnode_text(entry, false); vm_map_entry_deallocate(entry, FALSE); entry = next; } } void _vm_map_unlock(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_xunlock_(&map->lock, file, line); vm_map_process_deferred(); } } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_lock_flags_(&map->system_mtx, 0, file, line); else sx_slock_(&map->lock, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else { sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); } } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_xlock_(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !mtx_trylock_flags_(&map->system_mtx, 0, file, line) : !sx_try_slock_(&map->lock, file, line); return (error == 0); } /* * _vm_map_lock_upgrade: [ internal use only ] * * Tries to upgrade a read (shared) lock on the specified map to a write * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a * non-zero value if the upgrade fails. If the upgrade fails, the map is * returned without a read or write lock held. * * Requires that the map be read locked. */ int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { unsigned int last_timestamp; if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else { if (!sx_try_upgrade_(&map->lock, file, line)) { last_timestamp = map->timestamp; sx_sunlock_(&map->lock, file, line); vm_map_process_deferred(); /* * If the map's timestamp does not change while the * map is unlocked, then the upgrade succeeds. */ sx_xlock_(&map->lock, file, line); if (last_timestamp != map->timestamp) { sx_xunlock_(&map->lock, file, line); return (1); } } } map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { if (map->system_map) { mtx_assert_(&map->system_mtx, MA_OWNED, file, line); } else sx_downgrade_(&map->lock, file, line); } /* * vm_map_locked: * * Returns a non-zero value if the caller holds a write (exclusive) lock * on the specified map and the value "0" otherwise. */ int vm_map_locked(vm_map_t map) { if (map->system_map) return (mtx_owned(&map->system_mtx)); else return (sx_xlocked(&map->lock)); } #ifdef INVARIANTS static void _vm_map_assert_locked(vm_map_t map, const char *file, int line) { if (map->system_map) mtx_assert_(&map->system_mtx, MA_OWNED, file, line); else sx_assert_(&map->lock, SA_XLOCKED, file, line); } #define VM_MAP_ASSERT_LOCKED(map) \ _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE) #ifdef DIAGNOSTIC static int enable_vmmap_check = 1; #else static int enable_vmmap_check = 0; #endif SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN, &enable_vmmap_check, 0, "Enable vm map consistency checking"); static void _vm_map_assert_consistent(vm_map_t map) { vm_map_entry_t entry; vm_map_entry_t child; vm_size_t max_left, max_right; if (!enable_vmmap_check) return; for (entry = map->header.next; entry != &map->header; entry = entry->next) { KASSERT(entry->prev->end <= entry->start, ("map %p prev->end = %jx, start = %jx", map, (uintmax_t)entry->prev->end, (uintmax_t)entry->start)); KASSERT(entry->start < entry->end, ("map %p start = %jx, end = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->end)); KASSERT(entry->end <= entry->next->start, ("map %p end = %jx, next->start = %jx", map, (uintmax_t)entry->end, (uintmax_t)entry->next->start)); KASSERT(entry->left == NULL || entry->left->start < entry->start, ("map %p left->start = %jx, start = %jx", map, (uintmax_t)entry->left->start, (uintmax_t)entry->start)); KASSERT(entry->right == NULL || entry->start < entry->right->start, ("map %p start = %jx, right->start = %jx", map, (uintmax_t)entry->start, (uintmax_t)entry->right->start)); child = entry->left; max_left = (child != NULL) ? child->max_free : entry->start - entry->prev->end; child = entry->right; max_right = (child != NULL) ? child->max_free : entry->next->start - entry->end; KASSERT(entry->max_free == MAX(max_left, max_right), ("map %p max = %jx, max_left = %jx, max_right = %jx", map, (uintmax_t)entry->max_free, (uintmax_t)max_left, (uintmax_t)max_right)); } } #define VM_MAP_ASSERT_CONSISTENT(map) \ _vm_map_assert_consistent(map) #else #define VM_MAP_ASSERT_LOCKED(map) #define VM_MAP_ASSERT_CONSISTENT(map) #endif /* INVARIANTS */ /* * _vm_map_unlock_and_wait: * * Atomically releases the lock on the specified map and puts the calling * thread to sleep. The calling thread will remain asleep until either * vm_map_wakeup() is performed on the map or the specified timeout is * exceeded. * * WARNING! This function does not perform deferred deallocations of * objects and map entries. Therefore, the calling thread is expected to * reacquire the map lock after reawakening and later perform an ordinary * unlock operation, such as vm_map_unlock(), before completing its * operation on the map. */ int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line) { mtx_lock(&map_sleep_mtx); if (map->system_map) mtx_unlock_flags_(&map->system_mtx, 0, file, line); else sx_xunlock_(&map->lock, file, line); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", timo)); } /* * vm_map_wakeup: * * Awaken any threads that have slept on the map using * vm_map_unlock_and_wait(). */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the map unlock * and the msleep() in _vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } void vm_map_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); map->busy++; } void vm_map_unbusy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); KASSERT(map->busy, ("vm_map_unbusy: not busy")); if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) { vm_map_modflags(map, 0, MAP_BUSY_WAKEUP); wakeup(&map->busy); } } void vm_map_wait_busy(vm_map_t map) { VM_MAP_ASSERT_LOCKED(map); while (map->busy) { vm_map_modflags(map, MAP_BUSY_WAKEUP, 0); if (map->system_map) msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0); else sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0); } map->timestamp++; } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } /* + * vm_map_create: + * + * Creates and returns a new empty VM map with + * the given physical map structure, and having + * the given lower and upper address bounds. + */ +vm_map_t +vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) +{ + vm_map_t result; + + result = uma_zalloc(mapzone, M_WAITOK); + CTR1(KTR_VM, "vm_map_create: %p", result); + _vm_map_init(result, pmap, min, max); + return (result); +} + +/* * Initialize an existing vm_map structure * such as that in the vmspace structure. */ static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { map->header.next = map->header.prev = &map->header; map->header.eflags = MAP_ENTRY_HEADER; map->needs_wakeup = FALSE; map->system_map = 0; map->pmap = pmap; map->header.end = min; map->header.start = max; map->flags = 0; map->root = NULL; map->timestamp = 0; map->busy = 0; map->anon_loc = 0; } void vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, pmap, min, max); - mtx_init(&map->system_mtx, "vm map (system)", NULL, - MTX_DEF | MTX_DUPOK); - sx_init(&map->lock, "vm map (user)"); + mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); + sx_init(&map->lock, "user map"); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. */ static inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_max_free_{left,right}: * * Compute the size of the largest free gap between two entries, * one the root of a tree and the other the ancestor of that root * that is the least or greatest ancestor found on the search path. */ static inline vm_size_t vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor) { return (root->left != NULL ? root->left->max_free : root->start - left_ancestor->end); } static inline vm_size_t vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor) { return (root->right != NULL ? root->right->max_free : right_ancestor->start - root->end); } #define SPLAY_LEFT_STEP(root, y, rlist, test) do { \ vm_size_t max_free; \ \ /* \ * Infer root->right->max_free == root->max_free when \ * y->max_free < root->max_free || root->max_free == 0. \ * Otherwise, look right to find it. \ */ \ y = root->left; \ max_free = root->max_free; \ KASSERT(max_free >= vm_map_entry_max_free_right(root, rlist), \ ("%s: max_free invariant fails", __func__)); \ if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free) \ max_free = vm_map_entry_max_free_right(root, rlist); \ if (y != NULL && (test)) { \ /* Rotate right and make y root. */ \ root->left = y->right; \ y->right = root; \ if (max_free < y->max_free) \ root->max_free = max_free = MAX(max_free, \ vm_map_entry_max_free_left(root, y)); \ root = y; \ y = root->left; \ } \ /* Copy right->max_free. Put root on rlist. */ \ root->max_free = max_free; \ KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \ ("%s: max_free not copied from right", __func__)); \ root->left = rlist; \ rlist = root; \ root = y; \ } while (0) #define SPLAY_RIGHT_STEP(root, y, llist, test) do { \ vm_size_t max_free; \ \ /* \ * Infer root->left->max_free == root->max_free when \ * y->max_free < root->max_free || root->max_free == 0. \ * Otherwise, look left to find it. \ */ \ y = root->right; \ max_free = root->max_free; \ KASSERT(max_free >= vm_map_entry_max_free_left(root, llist), \ ("%s: max_free invariant fails", __func__)); \ if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free) \ max_free = vm_map_entry_max_free_left(root, llist); \ if (y != NULL && (test)) { \ /* Rotate left and make y root. */ \ root->right = y->left; \ y->left = root; \ if (max_free < y->max_free) \ root->max_free = max_free = MAX(max_free, \ vm_map_entry_max_free_right(root, y)); \ root = y; \ y = root->right; \ } \ /* Copy left->max_free. Put root on llist. */ \ root->max_free = max_free; \ KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \ ("%s: max_free not copied from left", __func__)); \ root->right = llist; \ llist = root; \ root = y; \ } while (0) /* * Walk down the tree until we find addr or a NULL pointer where addr would go, * breaking off left and right subtrees of nodes less than, or greater than * addr. Treat pointers to nodes with max_free < length as NULL pointers. * llist and rlist are the two sides in reverse order (bottom-up), with llist * linked by the right pointer and rlist linked by the left pointer in the * vm_map_entry, and both lists terminated by &map->header. This function, and * the subsequent call to vm_map_splay_merge, rely on the start and end address * values in &map->header. */ static vm_map_entry_t vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length, vm_map_entry_t *out_llist, vm_map_entry_t *out_rlist) { vm_map_entry_t llist, rlist, root, y; llist = rlist = &map->header; root = map->root; while (root != NULL && root->max_free >= length) { KASSERT(llist->end <= root->start && root->end <= rlist->start, ("%s: root not within tree bounds", __func__)); if (addr < root->start) { SPLAY_LEFT_STEP(root, y, rlist, y->max_free >= length && addr < y->start); } else if (addr >= root->end) { SPLAY_RIGHT_STEP(root, y, llist, y->max_free >= length && addr >= y->end); } else break; } *out_llist = llist; *out_rlist = rlist; return (root); } static void vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *iolist) { vm_map_entry_t rlist, y; root = root->right; rlist = *iolist; while (root != NULL) SPLAY_LEFT_STEP(root, y, rlist, true); *iolist = rlist; } static void vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *iolist) { vm_map_entry_t llist, y; root = root->left; llist = *iolist; while (root != NULL) SPLAY_RIGHT_STEP(root, y, llist, true); *iolist = llist; } static inline void vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b) { vm_map_entry_t tmp; tmp = *b; *b = *a; *a = tmp; } /* * Walk back up the two spines, flip the pointers and set max_free. The * subtrees of the root go at the bottom of llist and rlist. */ static void vm_map_splay_merge(vm_map_t map, vm_map_entry_t root, vm_map_entry_t llist, vm_map_entry_t rlist) { vm_map_entry_t prev; vm_size_t max_free_left, max_free_right; max_free_left = vm_map_entry_max_free_left(root, llist); if (llist != &map->header) { prev = root->left; do { /* * The max_free values of the children of llist are in * llist->max_free and max_free_left. Update with the * max value. */ llist->max_free = max_free_left = MAX(llist->max_free, max_free_left); vm_map_entry_swap(&llist->right, &prev); vm_map_entry_swap(&prev, &llist); } while (llist != &map->header); root->left = prev; } max_free_right = vm_map_entry_max_free_right(root, rlist); if (rlist != &map->header) { prev = root->right; do { /* * The max_free values of the children of rlist are in * rlist->max_free and max_free_right. Update with the * max value. */ rlist->max_free = max_free_right = MAX(rlist->max_free, max_free_right); vm_map_entry_swap(&rlist->left, &prev); vm_map_entry_swap(&prev, &rlist); } while (rlist != &map->header); root->right = prev; } root->max_free = MAX(max_free_left, max_free_right); map->root = root; } /* * vm_map_splay: * * The Sleator and Tarjan top-down splay algorithm with the * following variation. Max_free must be computed bottom-up, so * on the downward pass, maintain the left and right spines in * reverse order. Then, make a second pass up each side to fix * the pointers and compute max_free. The time bound is O(log n) * amortized. * * The new root is the vm_map_entry containing "addr", or else an * adjacent entry (lower if possible) if addr is not in the tree. * * The map must be locked, and leaves it so. * * Returns: the new root. */ static vm_map_entry_t vm_map_splay(vm_map_t map, vm_offset_t addr) { vm_map_entry_t llist, rlist, root; root = vm_map_splay_split(map, addr, 0, &llist, &rlist); if (root != NULL) { /* do nothing */ } else if (llist != &map->header) { /* * Recover the greatest node in the left * subtree and make it the root. */ root = llist; llist = root->right; root->right = NULL; } else if (rlist != &map->header) { /* * Recover the least node in the right * subtree and make it the root. */ root = rlist; rlist = root->left; root->left = NULL; } else { /* There is no root. */ return (NULL); } vm_map_splay_merge(map, root, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t llist, rlist, root; CTR3(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p", map, map->nentries, entry); VM_MAP_ASSERT_LOCKED(map); map->nentries++; root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root == NULL, ("vm_map_entry_link: link object already mapped")); entry->prev = llist; entry->next = rlist; llist->next = rlist->prev = entry; entry->left = entry->right = NULL; vm_map_splay_merge(map, entry, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); } enum unlink_merge_type { UNLINK_MERGE_PREV, UNLINK_MERGE_NONE, UNLINK_MERGE_NEXT }; static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry, enum unlink_merge_type op) { vm_map_entry_t llist, rlist, root, y; VM_MAP_ASSERT_LOCKED(map); root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root != NULL, ("vm_map_entry_unlink: unlink object not mapped")); switch (op) { case UNLINK_MERGE_PREV: vm_map_splay_findprev(root, &llist); llist->end = root->end; y = root->right; root = llist; llist = root->right; root->right = y; break; case UNLINK_MERGE_NEXT: vm_map_splay_findnext(root, &rlist); rlist->start = root->start; rlist->offset = root->offset; y = root->left; root = rlist; rlist = root->left; root->left = y; break; case UNLINK_MERGE_NONE: vm_map_splay_findprev(root, &llist); vm_map_splay_findnext(root, &rlist); if (llist != &map->header) { root = llist; llist = root->right; root->right = NULL; } else if (rlist != &map->header) { root = rlist; rlist = root->left; root->left = NULL; } else root = NULL; break; } y = entry->next; y->prev = entry->prev; y->prev->next = y; if (root != NULL) vm_map_splay_merge(map, root, llist, rlist); else map->root = NULL; VM_MAP_ASSERT_CONSISTENT(map); map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_entry_resize_free: * * Recompute the amount of free space following a modified vm_map_entry * and propagate those values up the tree. Call this function after * resizing a map entry in-place by changing the end value, without a * call to vm_map_entry_link() or _unlink(). * * The map must be locked, and leaves it so. */ static void vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t llist, rlist, root; VM_MAP_ASSERT_LOCKED(map); root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist); KASSERT(root != NULL, ("vm_map_entry_resize_free: resize_free object not mapped")); vm_map_splay_findnext(root, &rlist); root->right = NULL; vm_map_splay_merge(map, root, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); CTR3(KTR_VM, "vm_map_entry_resize_free: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur, lbound; boolean_t locked; /* * If the map is empty, then the map entry immediately preceding * "address" is the map's header. */ cur = map->root; if (cur == NULL) { *entry = &map->header; return (FALSE); } if (address >= cur->start && cur->end > address) { *entry = cur; return (TRUE); } if ((locked = vm_map_locked(map)) || sx_try_upgrade(&map->lock)) { /* * Splay requires a write lock on the map. However, it only * restructures the binary search tree; it does not otherwise * change the map. Thus, the map's timestamp need not change * on a temporary upgrade. */ cur = vm_map_splay(map, address); if (!locked) sx_downgrade(&map->lock); /* * If "address" is contained within a map entry, the new root * is that map entry. Otherwise, the new root is a map entry * immediately before or after "address". */ if (address < cur->start) { *entry = &map->header; return (FALSE); } *entry = cur; return (address < cur->end); } /* * Since the map is only locked for read access, perform a * standard binary search tree lookup for "address". */ lbound = &map->header; do { if (address < cur->start) { cur = cur->left; } else if (cur->end <= address) { lbound = cur; cur = cur->right; } else { *entry = cur; return (TRUE); } } while (cur != NULL); *entry = lbound; return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry, temp_entry; struct ucred *cred; vm_eflags_t protoeflags; vm_inherit_t inheritance; VM_MAP_ASSERT_LOCKED(map); KASSERT(object != kernel_object || (cow & MAP_COPY_ON_WRITE) == 0, ("vm_map_insert: kernel object and COW")); KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0, ("vm_map_insert: paradoxical MAP_NOFAULT request")); KASSERT((prot & ~max) == 0, ("prot %#x is not subset of max_prot %#x", prot, max)); /* * Check that the start and end points are not bogus. */ if (start == end || !vm_map_range_valid(map, start, end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if (prev_entry->next->start < end) return (KERN_NO_SPACE); if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL || max != VM_PROT_NONE)) return (KERN_INVALID_ARGUMENT); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) protoeflags |= MAP_ENTRY_NOFAULT; if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (cow & MAP_STACK_GROWS_DOWN) protoeflags |= MAP_ENTRY_GROWS_DOWN; if (cow & MAP_STACK_GROWS_UP) protoeflags |= MAP_ENTRY_GROWS_UP; if (cow & MAP_WRITECOUNT) protoeflags |= MAP_ENTRY_WRITECNT; if (cow & MAP_VN_EXEC) protoeflags |= MAP_ENTRY_VN_EXEC; if ((cow & MAP_CREATE_GUARD) != 0) protoeflags |= MAP_ENTRY_GUARD; if ((cow & MAP_CREATE_STACK_GAP_DN) != 0) protoeflags |= MAP_ENTRY_STACK_GAP_DN; if ((cow & MAP_CREATE_STACK_GAP_UP) != 0) protoeflags |= MAP_ENTRY_STACK_GAP_UP; if (cow & MAP_INHERIT_SHARE) inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; cred = NULL; if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) goto charged; if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) && ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) { if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start)) return (KERN_RESOURCE_SHORTAGE); KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 || object->cred == NULL, ("overcommit: vm_map_insert o %p", object)); cred = curthread->td_ucred; } charged: /* Expand the kernel pmap, if necessary. */ if (map == kernel_map && end > kernel_vm_end) pmap_growkernel(end); if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ VM_OBJECT_WLOCK(object); if (object->ref_count > 1 || object->shadow_count != 0) vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) == protoeflags && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP | MAP_VN_EXEC)) == 0 && prev_entry->end == start && (prev_entry->cred == cred || (prev_entry->object.vm_object != NULL && prev_entry->object.vm_object->cred == cred)) && vm_object_coalesce(prev_entry->object.vm_object, prev_entry->offset, (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end), cred != NULL && (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if (prev_entry->inheritance == inheritance && prev_entry->protection == prot && prev_entry->max_protection == max && prev_entry->wired_count == 0) { KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) == 0, ("prev_entry %p has incoherent wiring", prev_entry)); if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0) map->size += end - prev_entry->end; prev_entry->end = end; vm_map_entry_resize_free(map, prev_entry); vm_map_simplify_entry(map, prev_entry); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); if (cred != NULL && object != NULL && object->cred != NULL && !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { /* Object already accounts for this uid. */ cred = NULL; } } if (cred != NULL) crhold(cred); /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->cred = NULL; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->inheritance = inheritance; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; new_entry->wiring_thread = NULL; new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT; new_entry->next_read = start; KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry), ("overcommit: vm_map_insert leaks vm_map %p", new_entry)); new_entry->cred = cred; /* * Insert the new entry into the list */ vm_map_entry_link(map, new_entry); if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0) map->size += new_entry->end - new_entry->start; /* * Try to coalesce the new entry with both the previous and next * entries in the list. Previously, we only attempted to coalesce * with the previous entry when object is NULL. Here, we handle the * other cases, which are less common. */ vm_map_simplify_entry(map, new_entry); if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) { vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * vm_map_findspace: * * Find the first fit (lowest VM address) for "length" free bytes * beginning at address >= start in the given map. * * In a vm_map_entry, "max_free" is the maximum amount of * contiguous free space between an entry in its subtree and a * neighbor of that entry. This allows finding a free region in * one path down the tree, so O(log n) amortized with splay * trees. * * The map must be locked, and leaves it so. * * Returns: starting address if sufficient space, * vm_map_max(map)-length+1 if insufficient space. */ vm_offset_t vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length) { vm_map_entry_t llist, rlist, root, y; vm_size_t left_length; vm_offset_t gap_end; /* * Request must fit within min/max VM address and must avoid * address wrap. */ start = MAX(start, vm_map_min(map)); if (start >= vm_map_max(map) || length > vm_map_max(map) - start) return (vm_map_max(map) - length + 1); /* Empty tree means wide open address space. */ if (map->root == NULL) return (start); /* * After splay_split, if start is within an entry, push it to the start * of the following gap. If rlist is at the end of the gap containing * start, save the end of that gap in gap_end to see if the gap is big * enough; otherwise set gap_end to start skip gap-checking and move * directly to a search of the right subtree. */ root = vm_map_splay_split(map, start, length, &llist, &rlist); gap_end = rlist->start; if (root != NULL) { start = root->end; if (root->right != NULL) gap_end = start; } else if (rlist != &map->header) { root = rlist; rlist = root->left; root->left = NULL; } else { root = llist; llist = root->right; root->right = NULL; } vm_map_splay_merge(map, root, llist, rlist); VM_MAP_ASSERT_CONSISTENT(map); if (length <= gap_end - start) return (start); /* With max_free, can immediately tell if no solution. */ if (root->right == NULL || length > root->right->max_free) return (vm_map_max(map) - length + 1); /* * Splay for the least large-enough gap in the right subtree. */ llist = rlist = &map->header; for (left_length = 0;; left_length = vm_map_entry_max_free_left(root, llist)) { if (length <= left_length) SPLAY_LEFT_STEP(root, y, rlist, length <= vm_map_entry_max_free_left(y, llist)); else SPLAY_RIGHT_STEP(root, y, llist, length > vm_map_entry_max_free_left(y, root)); if (root == NULL) break; } root = llist; llist = root->right; root->right = NULL; if (rlist != &map->header) { y = rlist; rlist = y->left; y->left = NULL; vm_map_splay_merge(map, y, &map->header, rlist); y->max_free = MAX( vm_map_entry_max_free_left(y, root), vm_map_entry_max_free_right(y, &map->header)); root->right = y; } vm_map_splay_merge(map, root, llist, &map->header); VM_MAP_ASSERT_CONSISTENT(map); return (root->end); } int vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_size_t length, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t end; int result; end = start + length; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_fixed: non-NULL backing object for stack")); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if ((cow & MAP_CHECK_EXCL) == 0) vm_map_delete(map, start, end); if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { result = vm_map_stack_locked(map, start, length, sgrowsiz, prot, max, cow); } else { result = vm_map_insert(map, object, offset, start, end, prot, max, cow); } vm_map_unlock(map); return (result); } static const int aslr_pages_rnd_64[2] = {0x1000, 0x10}; static const int aslr_pages_rnd_32[2] = {0x100, 0x4}; static int cluster_anon = 1; SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW, &cluster_anon, 0, "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always"); static bool clustering_anon_allowed(vm_offset_t addr) { switch (cluster_anon) { case 0: return (false); case 1: return (addr == 0); case 2: default: return (true); } } static long aslr_restarts; SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD, &aslr_restarts, 0, "Number of aslr failures"); #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) /* * Searches for the specified amount of free space in the given map with the * specified alignment. Performs an address-ordered, first-fit search from * the given address "*addr", with an optional upper bound "max_addr". If the * parameter "alignment" is zero, then the alignment is computed from the * given (object, offset) pair so as to enable the greatest possible use of * superpage mappings. Returns KERN_SUCCESS and the address of the free space * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE. * * The map must be locked. Initially, there must be at least "length" bytes * of free space at the given address. */ static int vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr, vm_offset_t alignment) { vm_offset_t aligned_addr, free_addr; VM_MAP_ASSERT_LOCKED(map); free_addr = *addr; KASSERT(free_addr == vm_map_findspace(map, free_addr, length), ("caller failed to provide space %#jx at address %p", (uintmax_t)length, (void *)free_addr)); for (;;) { /* * At the start of every iteration, the free space at address * "*addr" is at least "length" bytes. */ if (alignment == 0) pmap_align_superpage(object, offset, addr, length); else if ((*addr & (alignment - 1)) != 0) { *addr &= ~(alignment - 1); *addr += alignment; } aligned_addr = *addr; if (aligned_addr == free_addr) { /* * Alignment did not change "*addr", so "*addr" must * still provide sufficient free space. */ return (KERN_SUCCESS); } /* * Test for address wrap on "*addr". A wrapped "*addr" could * be a valid address, in which case vm_map_findspace() cannot * be relied upon to fail. */ if (aligned_addr < free_addr) return (KERN_NO_SPACE); *addr = vm_map_findspace(map, aligned_addr, length); if (*addr + length > vm_map_max(map) || (max_addr != 0 && *addr + length > max_addr)) return (KERN_NO_SPACE); free_addr = *addr; if (free_addr == aligned_addr) { /* * If a successful call to vm_map_findspace() did not * change "*addr", then "*addr" must still be aligned * and provide sufficient free space. */ return (KERN_SUCCESS); } } } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t alignment, curr_min_addr, min_addr; int gap, pidx, rv, try; bool cluster, en_aslr, update_anon; KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 || object == NULL, ("vm_map_find: non-NULL backing object for stack")); MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE && (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)); if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL || (object->flags & OBJ_COLORED) == 0)) find_space = VMFS_ANY_SPACE; if (find_space >> 8 != 0) { KASSERT((find_space & 0xff) == 0, ("bad VMFS flags")); alignment = (vm_offset_t)1 << (find_space >> 8); } else alignment = 0; en_aslr = (map->flags & MAP_ASLR) != 0; update_anon = cluster = clustering_anon_allowed(*addr) && (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 && find_space != VMFS_NO_SPACE && object == NULL && (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP | MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE; curr_min_addr = min_addr = *addr; if (en_aslr && min_addr == 0 && !cluster && find_space != VMFS_NO_SPACE && (map->flags & MAP_ASLR_IGNSTART) != 0) curr_min_addr = min_addr = vm_map_min(map); try = 0; vm_map_lock(map); if (cluster) { curr_min_addr = map->anon_loc; if (curr_min_addr == 0) cluster = false; } if (find_space != VMFS_NO_SPACE) { KASSERT(find_space == VMFS_ANY_SPACE || find_space == VMFS_OPTIMAL_SPACE || find_space == VMFS_SUPER_SPACE || alignment != 0, ("unexpected VMFS flag")); again: /* * When creating an anonymous mapping, try clustering * with an existing anonymous mapping first. * * We make up to two attempts to find address space * for a given find_space value. The first attempt may * apply randomization or may cluster with an existing * anonymous mapping. If this first attempt fails, * perform a first-fit search of the available address * space. * * If all tries failed, and find_space is * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE. * Again enable clustering and randomization. */ try++; MPASS(try <= 2); if (try == 2) { /* * Second try: we failed either to find a * suitable region for randomizing the * allocation, or to cluster with an existing * mapping. Retry with free run. */ curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ? vm_map_min(map) : min_addr; atomic_add_long(&aslr_restarts, 1); } if (try == 1 && en_aslr && !cluster) { /* * Find space for allocation, including * gap needed for later randomization. */ pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 && (find_space == VMFS_SUPER_SPACE || find_space == VMFS_OPTIMAL_SPACE) ? 1 : 0; gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR && (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ? aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx]; *addr = vm_map_findspace(map, curr_min_addr, length + gap * pagesizes[pidx]); if (*addr + length + gap * pagesizes[pidx] > vm_map_max(map)) goto again; /* And randomize the start address. */ *addr += (arc4random() % gap) * pagesizes[pidx]; if (max_addr != 0 && *addr + length > max_addr) goto again; } else { *addr = vm_map_findspace(map, curr_min_addr, length); if (*addr + length > vm_map_max(map) || (max_addr != 0 && *addr + length > max_addr)) { if (cluster) { cluster = false; MPASS(try == 1); goto again; } rv = KERN_NO_SPACE; goto done; } } if (find_space != VMFS_ANY_SPACE && (rv = vm_map_alignspace(map, object, offset, addr, length, max_addr, alignment)) != KERN_SUCCESS) { if (find_space == VMFS_OPTIMAL_SPACE) { find_space = VMFS_ANY_SPACE; curr_min_addr = min_addr; cluster = update_anon; try = 0; goto again; } goto done; } } else if ((cow & MAP_REMAP) != 0) { if (!vm_map_range_valid(map, *addr, *addr + length)) { rv = KERN_INVALID_ADDRESS; goto done; } vm_map_delete(map, *addr, *addr + length); } if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) { rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot, max, cow); } else { rv = vm_map_insert(map, object, offset, *addr, *addr + length, prot, max, cow); } if (rv == KERN_SUCCESS && update_anon) map->anon_loc = *addr + length; done: vm_map_unlock(map); return (rv); } /* * vm_map_find_min() is a variant of vm_map_find() that takes an * additional parameter (min_addr) and treats the given address * (*addr) differently. Specifically, it treats *addr as a hint * and not as the minimum address where the mapping is created. * * This function works in two phases. First, it tries to * allocate above the hint. If that fails and the hint is * greater than min_addr, it performs a second pass, replacing * the hint with min_addr as the minimum address for the * allocation. */ int vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr, vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t hint; int rv; hint = *addr; for (;;) { rv = vm_map_find(map, object, offset, addr, length, max_addr, find_space, prot, max, cow); if (rv == KERN_SUCCESS || min_addr >= hint) return (rv); *addr = hint = min_addr; } } /* * A map entry with any of the following flags set must not be merged with * another entry. */ #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \ MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC) static bool vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry) { KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 || (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0, ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable", prev, entry)); return (prev->end == entry->start && prev->object.vm_object == entry->object.vm_object && (prev->object.vm_object == NULL || prev->offset + (prev->end - prev->start) == entry->offset) && prev->eflags == entry->eflags && prev->protection == entry->protection && prev->max_protection == entry->max_protection && prev->inheritance == entry->inheritance && prev->wired_count == entry->wired_count && prev->cred == entry->cred); } static void vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry) { /* * If the backing object is a vnode object, vm_object_deallocate() * calls vrele(). However, vrele() does not lock the vnode because * the vnode has additional references. Thus, the map lock can be * kept without causing a lock-order reversal with the vnode lock. * * Since we count the number of virtual page mappings in * object->un_pager.vnp.writemappings, the writemappings value * should not be adjusted when the entry is disposed of. */ if (entry->object.vm_object != NULL) vm_object_deallocate(entry->object.vm_object); if (entry->cred != NULL) crfree(entry->cred); vm_map_entry_dispose(map, entry); } /* * vm_map_simplify_entry: * * Simplify the given map entry by merging with either neighbor. This * routine also has the ability to merge with both neighbors. * * The map must be locked. * * This routine guarantees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or * both neighbors. */ void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev; if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) != 0) return; prev = entry->prev; if (vm_map_mergeable_neighbors(prev, entry)) { vm_map_entry_unlink(map, prev, UNLINK_MERGE_NEXT); vm_map_merged_neighbor_dispose(map, prev); } next = entry->next; if (vm_map_mergeable_neighbors(entry, next)) { vm_map_entry_unlink(map, next, UNLINK_MERGE_PREV); vm_map_merged_neighbor_dispose(map, next); } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->end > start && entry->start < start, ("_vm_map_clip_start: invalid clip of entry %p", entry)); /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map && (entry->eflags & MAP_ENTRY_GUARD) == 0) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; if (entry->cred != NULL) { object->cred = entry->cred; object->charge = entry->end - entry->start; entry->cred = NULL; } } else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); vm_map_entry_set_vnode_text(new_entry, true); /* * The object->un_pager.vnp.writemappings for the * object of MAP_ENTRY_WRITECNT type entry shall be * kept as is here. The virtual pages are * re-distributed among the clipped entries, so the sum is * left the same. */ } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if ((endaddr) < (entry->end)) \ _vm_map_clip_end((map), (entry), (endaddr)); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) { vm_map_entry_t new_entry; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->start < end && entry->end > end, ("_vm_map_clip_end: invalid clip of entry %p", entry)); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map && (entry->eflags & MAP_ENTRY_GUARD) == 0) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; if (entry->cred != NULL) { object->cred = entry->cred; object->charge = entry->end - entry->start; entry->cred = NULL; } } else if (entry->object.vm_object != NULL && ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) && entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); KASSERT(entry->object.vm_object->cred == NULL, ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry)); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = entry->end - entry->start; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); if (new_entry->cred != NULL) crhold(entry->cred); vm_map_entry_link(map, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); vm_map_entry_set_vnode_text(new_entry, true); } } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result; result = KERN_INVALID_ARGUMENT; vm_map_lock(submap); submap->flags |= MAP_IS_SUB_MAP; vm_map_unlock(submap); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); if (result != KERN_SUCCESS) { vm_map_lock(submap); submap->flags &= ~MAP_IS_SUB_MAP; vm_map_unlock(submap); } return (result); } /* * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload the specified map's pmap with mappings to the specified * object's memory-resident pages. No further physical pages are * allocated, and no further virtual pages are retrieved from secondary * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a * limited number of page mappings are created at the low-end of the * specified address range. (For this purpose, a superpage mapping * counts as one page mapping.) Otherwise, all resident pages within * the specified address range are mapped. */ static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t start; vm_page_t p, p_start; vm_pindex_t mask, psize, threshold, tmpidx; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; VM_OBJECT_RLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { VM_OBJECT_RUNLOCK(object); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); VM_OBJECT_WUNLOCK(object); return; } VM_OBJECT_LOCK_DOWNGRADE(object); } psize = atop(size); if (psize + pindex > object->size) { if (pindex >= object->size) { VM_OBJECT_RUNLOCK(object); return; } psize = object->size - pindex; } start = 0; p_start = NULL; threshold = MAX_INIT_PT; p = vm_page_find_least(object, pindex); /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if (((flags & MAP_PREFAULT_MADVISE) != 0 && vm_page_count_severe()) || ((flags & MAP_PREFAULT_PARTIAL) != 0 && tmpidx >= threshold)) { psize = tmpidx; break; } if (p->valid == VM_PAGE_BITS_ALL) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; } /* Jump ahead if a superpage mapping is possible. */ if (p->psind > 0 && ((addr + ptoa(tmpidx)) & (pagesizes[p->psind] - 1)) == 0) { mask = atop(pagesizes[p->psind]) - 1; if (tmpidx + mask < psize && vm_page_ps_test(p, PS_ALL_VALID, NULL)) { p += mask; threshold += mask; } } } else if (p_start != NULL) { pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } if (p_start != NULL) pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); VM_OBJECT_RUNLOCK(object); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current, entry, in_tran; vm_object_t obj; struct ucred *cred; vm_prot_t old_prot; if (start == end) return (KERN_SUCCESS); again: in_tran = NULL; vm_map_lock(map); /* * Ensure that we are not concurrently wiring pages. vm_map_wire() may * need to fault pages into the map and will drop the map lock while * doing so, and the VM object may end up in an inconsistent state if we * update the protection on the map entry in between faults. */ vm_map_wait_busy(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else { entry = entry->next; } /* * Make a first pass to check for protection violations. */ for (current = entry; current->start < end; current = current->next) { if ((current->eflags & MAP_ENTRY_GUARD) != 0) continue; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0) in_tran = current; } /* * Postpone the operation until all in-transition map entries have * stabilized. An in-transition entry might already have its pages * wired and wired_count incremented, but not yet have its * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call * vm_fault_copy_entry() in the final loop below. */ if (in_tran != NULL) { in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(map, 0); goto again; } /* * Do an accounting pass for private read-only mappings that * now will do cow due to allowed write (e.g. debugger sets * breakpoint on text segment) */ for (current = entry; current->start < end; current = current->next) { vm_map_clip_end(map, current, end); if (set_max || ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 || ENTRY_CHARGED(current) || (current->eflags & MAP_ENTRY_GUARD) != 0) { continue; } cred = curthread->td_ucred; obj = current->object.vm_object; if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) { if (!swap_reserve(current->end - current->start)) { vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } crhold(cred); current->cred = cred; continue; } VM_OBJECT_WLOCK(obj); if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { VM_OBJECT_WUNLOCK(obj); continue; } /* * Charge for the whole object allocation now, since * we cannot distinguish between non-charged and * charged clipped mapping of the same object later. */ KASSERT(obj->charge == 0, ("vm_map_protect: object %p overcharged (entry %p)", obj, current)); if (!swap_reserve(ptoa(obj->size))) { VM_OBJECT_WUNLOCK(obj); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } crhold(cred); obj->cred = cred; obj->charge = ptoa(obj->size); VM_OBJECT_WUNLOCK(obj); } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ for (current = entry; current->start < end; current = current->next) { if ((current->eflags & MAP_ENTRY_GUARD) != 0) continue; old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * For user wired map entries, the normal lazy evaluation of * write access upgrades through soft page faults is * undesirable. Instead, immediately copy any pages that are * copy-on-write and enable write access in the physical map. */ if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 && (current->protection & VM_PROT_WRITE) != 0 && (old_prot & VM_PROT_WRITE) == 0) vm_fault_copy_entry(map, map, current, current, NULL); /* * When restricting access, update the physical map. Worry * about copy-on-write here. */ if ((old_prot & ~current->protection) != 0) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK } vm_map_simplify_entry(map, current); } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t current, entry; bool modify_map; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: if (start == end) return (0); modify_map = true; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: if (start == end) return (0); modify_map = false; vm_map_lock_read(map); break; default: return (EINVAL); } /* * Locate starting entry and clip if necessary. */ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = entry->next; } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; current->start < end; current = current->next) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_simplify_entry(map, current); } vm_map_unlock(map); } else { vm_pindex_t pstart, pend; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ for (current = entry; current->start < end; current = current->next) { vm_offset_t useEnd, useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; /* * MADV_FREE would otherwise rewind time to * the creation of the shadow object. Because * we hold the VM map read-locked, neither the * entry's object nor the presence of a * backing object can change. */ if (behav == MADV_FREE && current->object.vm_object != NULL && current->object.vm_object->backing_object != NULL) continue; pstart = OFF_TO_IDX(current->offset); pend = pstart + atop(current->end - current->start); useStart = current->start; useEnd = current->end; if (current->start < start) { pstart += atop(start - current->start); useStart = start; } if (current->end > end) { pend -= atop(current->end - end); useEnd = end; } if (pstart >= pend) continue; /* * Perform the pmap_advise() before clearing * PGA_REFERENCED in vm_page_advise(). Otherwise, a * concurrent pmap operation, such as pmap_remove(), * could clear a reference in the pmap and set * PGA_REFERENCED on the page before the pmap_advise() * had completed. Consequently, the page would appear * referenced based upon an old reference that * occurred before this pmap_advise() ran. */ if (behav == MADV_DONTNEED || behav == MADV_FREE) pmap_advise(map->pmap, useStart, useEnd, behav); vm_object_madvise(current->object.vm_object, pstart, pend, behav); /* * Pre-populate paging structures in the * WILLNEED case. For wired entries, the * paging structures are already populated. */ if (behav == MADV_WILLNEED && current->wired_count == 0) { vm_map_pmap_enter(map, useStart, current->protection, current->object.vm_object, pstart, ptoa(pend - pstart), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vmspace_fork. */ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: case VM_INHERIT_ZERO: break; default: return (KERN_INVALID_ARGUMENT); } if (start == end) return (KERN_SUCCESS); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while (entry->start < end) { vm_map_clip_end(map, entry, end); if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || new_inheritance != VM_INHERIT_ZERO) entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_unwire; if (start == end) return (KERN_SUCCESS); user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user unwiring? */ } vm_map_lock(map); if (last_timestamp+1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * First_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && entry->next->start > entry->end)) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } /* * If system unwiring, require that the entry is system wired. */ if (!user_unwire && vm_map_entry_system_wired_count(entry) == 0) { end = entry->end; rv = KERN_INVALID_ARGUMENT; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_unwire: lookup failed")); } for (entry = first_entry; entry->start < end; entry = entry->next) { /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for draining * MAP_ENTRY_IN_TRANSITION. Moreover, another thread * could be simultaneously wiring this new mapping * entry. Detect these cases and skip any entries * marked as in transition by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, ("vm_map_unwire: !HOLESOK and new/changed entry")); continue; } if (rv == KERN_SUCCESS && (!user_unwire || (entry->eflags & MAP_ENTRY_USER_WIRED))) { if (entry->wired_count == 1) vm_map_entry_unwire(map, entry); else entry->wired_count--; if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; } KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_unwire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_unwire: alien wire %p", entry)); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } static void vm_map_wire_user_count_sub(u_long npages) { atomic_subtract_long(&vm_user_wire_count, npages); } static bool vm_map_wire_user_count_add(u_long npages) { u_long wired; wired = vm_user_wire_count; do { if (npages + wired > (u_long)vm_page_max_user_wired) return (false); } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired, npages + wired)); return (true); } /* * vm_map_wire_entry_failure: * * Handle a wiring failure on the given entry. * * The map should be locked. */ static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, vm_offset_t failed_addr) { VM_MAP_ASSERT_LOCKED(map); KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && entry->wired_count == 1, ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); KASSERT(failed_addr < entry->end, ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); /* * If any pages at the start of this entry were successfully wired, * then unwire them. */ if (failed_addr > entry->start) { pmap_unwire(map->pmap, entry->start, failed_addr); vm_object_unwire(entry->object.vm_object, entry->offset, failed_addr - entry->start, PQ_ACTIVE); } /* * Assign an out-of-range value to represent the failure to wire this * entry. */ entry->wired_count = -1; } int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { int rv; vm_map_lock(map); rv = vm_map_wire_locked(map, start, end, flags); vm_map_unlock(map); return (rv); } /* * vm_map_wire_locked: * * Implements both kernel and user wiring. Returns with the map locked, * the map lock may be dropped. */ int vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t faddr, saved_end, saved_start; u_long npages; u_int last_timestamp; int rv; boolean_t need_wakeup, result, user_wire; vm_prot_t prot; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); prot = 0; if (flags & VM_MAP_WIRE_WRITE) prot |= VM_PROT_WRITE; user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else return (KERN_INVALID_ADDRESS); } last_timestamp = map->timestamp; entry = first_entry; while (entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, 0)) { /* * Allow interruption of user wiring? */ } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * first_entry has been deleted. */ return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && entry->wiring_thread == NULL, ("owned map entry %p", entry)); entry->eflags |= MAP_ENTRY_IN_TRANSITION; entry->wiring_thread = curthread; if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || (entry->protection & prot) != prot) { entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; if ((flags & VM_MAP_WIRE_HOLESOK) == 0) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } goto next_entry; } if (entry->wired_count == 0) { entry->wired_count++; npages = atop(entry->end - entry->start); if (user_wire && !vm_map_wire_user_count_add(npages)) { vm_map_wire_entry_failure(map, entry, entry->start); end = entry->end; rv = KERN_RESOURCE_SHORTAGE; goto done; } /* * Release the map lock, relying on the in-transition * mark. Mark the map busy for fork. */ saved_start = entry->start; saved_end = entry->end; vm_map_busy(map); vm_map_unlock(map); faddr = saved_start; do { /* * Simulate a fault to get the page and enter * it into the physical map. */ if ((rv = vm_fault(map, faddr, VM_PROT_NONE, VM_FAULT_WIRE, NULL)) != KERN_SUCCESS) break; } while ((faddr += PAGE_SIZE) < saved_end); vm_map_lock(map); vm_map_unbusy(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ result = vm_map_lookup_entry(map, saved_start, &tmp_entry); KASSERT(result, ("vm_map_wire: lookup failed")); if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; while (entry->end < saved_end) { /* * In case of failure, handle entries * that were not fully wired here; * fully wired entries are handled * later. */ if (rv != KERN_SUCCESS && faddr < entry->end) vm_map_wire_entry_failure(map, entry, faddr); entry = entry->next; } } last_timestamp = map->timestamp; if (rv != KERN_SUCCESS) { vm_map_wire_entry_failure(map, entry, faddr); if (user_wire) vm_map_wire_user_count_sub(npages); end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ next_entry: if ((flags & VM_MAP_WIRE_HOLESOK) == 0 && entry->end < end && entry->next->start > entry->end) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_wire: lookup failed")); } for (entry = first_entry; entry->start < end; entry = entry->next) { /* * If VM_MAP_WIRE_HOLESOK was specified, an empty * space in the unwired region could have been mapped * while the map lock was dropped for faulting in the * pages or draining MAP_ENTRY_IN_TRANSITION. * Moreover, another thread could be simultaneously * wiring this new mapping entry. Detect these cases * and skip any entries marked as in transition not by us. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || entry->wiring_thread != curthread) { KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, ("vm_map_wire: !HOLESOK and new/changed entry")); continue; } if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) goto next_entry_done; if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { /* * Undo the wiring. Wiring succeeded on this entry * but failed on a later entry. */ if (entry->wired_count == 1) { vm_map_entry_unwire(map, entry); if (user_wire) vm_map_wire_user_count_sub( atop(entry->end - entry->start)); } else entry->wired_count--; } next_entry_done: KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, ("vm_map_wire: in-transition flag missing %p", entry)); KASSERT(entry->wiring_thread == curthread, ("vm_map_wire: alien wire %p", entry)); entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WIRE_SKIPPED); entry->wiring_thread = NULL; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); } if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; unsigned int last_timestamp; boolean_t failed; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = entry->start; end = entry->end; } /* * Make a first pass to check for user-wired memory and holes. */ for (current = entry; current->start < end; current = current->next) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && current->end != current->next->start) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(map->pmap, start, end); failed = FALSE; /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end;) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } vm_object_reference(object); last_timestamp = map->timestamp; vm_map_unlock_read(map); if (!vm_object_sync(object, offset, size, syncio, invalidate)) failed = TRUE; start += size; vm_object_deallocate(object); vm_map_lock_read(map); if (last_timestamp == map->timestamp || !vm_map_lookup_entry(map, start, ¤t)) current = current->next; } vm_map_unlock_read(map); return (failed ? KERN_FAILURE : KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { vm_size_t size; VM_MAP_ASSERT_LOCKED(map); KASSERT(entry->wired_count > 0, ("vm_map_entry_unwire: entry %p isn't wired", entry)); size = entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) vm_map_wire_user_count_sub(atop(size)); pmap_unwire(map->pmap, entry->start, entry->end); vm_object_unwire(entry->object.vm_object, entry->offset, size, PQ_ACTIVE); entry->wired_count = 0; } static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) vm_object_deallocate(entry->object.vm_object); uma_zfree(system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, count, size1; vm_size_t size; vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE); object = entry->object.vm_object; if ((entry->eflags & MAP_ENTRY_GUARD) != 0) { MPASS(entry->cred == NULL); MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0); MPASS(object == NULL); vm_map_entry_deallocate(entry, map->system_map); return; } size = entry->end - entry->start; map->size -= size; if (entry->cred != NULL) { swap_release_by_cred(size, entry->cred); crfree(entry->cred); } if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object != NULL)) { KASSERT(entry->cred == NULL || object->cred == NULL || (entry->eflags & MAP_ENTRY_NEEDS_COPY), ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); count = atop(size); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_WLOCK(object); if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT | OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || object == kernel_object)) { vm_object_collapse(object); /* * The option OBJPR_NOTMAPPED can be passed here * because vm_map_delete() already performed * pmap_remove() on the only mapping to this range * of pages. */ vm_object_page_remove(object, offidxstart, offidxend, OBJPR_NOTMAPPED); if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && offidxstart < object->size) { size1 = object->size; object->size = offidxstart; if (object->cred != NULL) { size1 -= object->size; KASSERT(object->charge >= ptoa(size1), ("object %p charge < 0", object)); swap_release_by_cred(ptoa(size1), object->cred); object->charge -= ptoa(size1); } } } VM_OBJECT_WUNLOCK(object); } else entry->object.vm_object = NULL; if (map->system_map) vm_map_entry_deallocate(entry, TRUE); else { entry->next = curthread->td_map_def_user; curthread->td_map_def_user = entry; } } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. */ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry; vm_map_entry_t first_entry; VM_MAP_ASSERT_LOCKED(map); if (start == end) return (KERN_SUCCESS); /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); } /* * Step through all entries in this region */ while (entry->start < end) { vm_map_entry_t next; /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on * user maps. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && vm_map_entry_system_wired_count(entry) != 0)) { unsigned int last_timestamp; vm_offset_t saved_start; vm_map_entry_t tmp_entry; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, 0); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) entry = tmp_entry->next; else { entry = tmp_entry; vm_map_clip_start(map, entry, saved_start); } } continue; } vm_map_clip_end(map, entry, end); next = entry->next; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); /* * Remove mappings for the pages, but only if the * mappings could exist. For instance, it does not * make sense to call pmap_remove() for guard entries. */ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || entry->object.vm_object != NULL) pmap_remove(map->pmap, entry->start, entry->end); if (entry->end == map->anon_loc) map->anon_loc = entry->start; /* * Delete the entry only after removing all pmap * entries pointing to its pages. (Otherwise, its * page frames may be reallocated, and any modify bits * will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry, vm_ooffset_t *fork_charge) { vm_object_t src_object; vm_map_entry_t fake_entry; vm_offset_t size; struct ucred *cred; int charged; VM_MAP_ASSERT_LOCKED(dst_map); if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0 || (src_entry->protection & VM_PROT_WRITE) == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && (src_entry->protection & VM_PROT_WRITE) != 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ size = src_entry->end - src_entry->start; if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_WLOCK(src_object); charged = ENTRY_CHARGED(src_entry); if (src_object->handle == NULL && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { vm_object_collapse(src_object); if ((src_object->flags & (OBJ_NOSPLIT | OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); if (src_entry->cred != NULL && !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { KASSERT(src_object->cred == NULL, ("OVERCOMMIT: vm_map_copy_entry: cred %p", src_object)); src_object->cred = src_entry->cred; src_object->charge = size; } VM_OBJECT_WUNLOCK(src_object); dst_entry->object.vm_object = src_object; if (charged) { cred = curthread->td_ucred; crhold(cred); dst_entry->cred = cred; *fork_charge += size; if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { crhold(cred); src_entry->cred = cred; *fork_charge += size; } } src_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; dst_entry->offset = src_entry->offset; if (src_entry->eflags & MAP_ENTRY_WRITECNT) { /* * MAP_ENTRY_WRITECNT cannot * indicate write reference from * src_entry, since the entry is * marked as needs copy. Allocate a * fake entry that is used to * decrement object->un_pager writecount * at the appropriate time. Attach * fake_entry to the deferred list. */ fake_entry = vm_map_entry_create(dst_map); fake_entry->eflags = MAP_ENTRY_WRITECNT; src_entry->eflags &= ~MAP_ENTRY_WRITECNT; vm_object_reference(src_object); fake_entry->object.vm_object = src_object; fake_entry->start = src_entry->start; fake_entry->end = src_entry->end; fake_entry->next = curthread->td_map_def_user; curthread->td_map_def_user = fake_entry; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; if (src_entry->cred != NULL) { dst_entry->cred = curthread->td_ucred; crhold(dst_entry->cred); *fork_charge += size; } } } else { /* * We don't want to make writeable wired pages copy-on-write. * Immediately copy these pages into the new map by simulating * page faults. The new pages are pageable. */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, fork_charge); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) */ static void vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_map_entry_t entry) { vm_size_t entrysize; vm_offset_t newend; if ((entry->eflags & MAP_ENTRY_GUARD) != 0) return; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { vm2->vm_ssize += btoc(entrysize); } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); vm2->vm_dsize += btoc(newend - entry->start); } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); vm2->vm_tsize += btoc(newend - entry->start); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * XXX It might be worth coalescing the entries added to the new vmspace. * * The source map must not be locked. */ struct vmspace * vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) { struct vmspace *vm2; vm_map_t new_map, old_map; vm_map_entry_t new_entry, old_entry; vm_object_t object; int error, locked; vm_inherit_t inh; old_map = &vm1->vm_map; /* Copy immutable fields of vm1 to vm2. */ vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map), pmap_pinit); if (vm2 == NULL) return (NULL); vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; vm_map_lock(old_map); if (old_map->busy) vm_map_wait_busy(old_map); new_map = &vm2->vm_map; locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ KASSERT(locked, ("vmspace_fork: lock failed")); error = pmap_vmspace_copy(new_map->pmap, old_map->pmap); if (error != 0) { sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); vmspace_free(vm2); return (NULL); } new_map->anon_loc = old_map->anon_loc; new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART); old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); inh = old_entry->inheritance; if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 && inh != VM_INHERIT_NONE) inh = VM_INHERIT_COPY; switch (inh) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. */ object = old_entry->object.vm_object; if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = 0; if (old_entry->cred != NULL) { object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, old_entry->end - old_entry->start); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; /* Transfer the second reference too. */ vm_object_reference( old_entry->object.vm_object); /* * As in vm_map_simplify_entry(), the * vnode lock will not be acquired in * this call to vm_object_deallocate(). */ vm_object_deallocate(object); object = old_entry->object.vm_object; } VM_OBJECT_WLOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); if (old_entry->cred != NULL) { KASSERT(object->cred == NULL, ("vmspace_fork both cred")); object->cred = old_entry->cred; object->charge = old_entry->end - old_entry->start; old_entry->cred = NULL; } /* * Assert the correct state of the vnode * v_writecount while the object is locked, to * not relock it later for the assertion * correctness. */ if (old_entry->eflags & MAP_ENTRY_WRITECNT && object->type == OBJT_VNODE) { KASSERT(((struct vnode *)object->handle)-> v_writecount > 0, ("vmspace_fork: v_writecount %p", object)); KASSERT(object->un_pager.vnp.writemappings > 0, ("vmspace_fork: vnp.writecount %p", object)); } VM_OBJECT_WUNLOCK(object); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; if (new_entry->eflags & MAP_ENTRY_WRITECNT) { vm_pager_update_writecount(object, new_entry->start, new_entry->end); } vm_map_entry_set_vnode_text(new_entry, true); /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; /* * Copied entry is COW over the old object. */ new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT); new_entry->wiring_thread = NULL; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->cred = NULL; vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry, fork_charge); vm_map_entry_set_vnode_text(new_entry, true); break; case VM_INHERIT_ZERO: /* * Create a new anonymous mapping entry modelled from * the old one. */ new_entry = vm_map_entry_create(new_map); memset(new_entry, 0, sizeof(*new_entry)); new_entry->start = old_entry->start; new_entry->end = old_entry->end; new_entry->eflags = old_entry->eflags & ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC); new_entry->protection = old_entry->protection; new_entry->max_protection = old_entry->max_protection; new_entry->inheritance = VM_INHERIT_ZERO; vm_map_entry_link(new_map, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); new_entry->cred = curthread->td_ucred; crhold(new_entry->cred); *fork_charge += (new_entry->end - new_entry->start); break; } old_entry = old_entry->next; } /* * Use inlined vm_map_unlock() to postpone handling the deferred * map entries, which cannot be done until both old_map and * new_map locks are released. */ sx_xunlock(&old_map->lock); sx_xunlock(&new_map->lock); vm_map_process_deferred(); return (vm2); } /* * Create a process's stack for exec_new_vmspace(). This function is never * asked to wire the newly created stack. */ int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_size_t growsize, init_ssize; rlim_t vmemlim; int rv; MPASS((map->flags & MAP_WIREFUTURE) == 0); growsize = sgrowsiz; init_ssize = (max_ssize < growsize) ? max_ssize : growsize; vm_map_lock(map); vmemlim = lim_cur(curthread, RLIMIT_VMEM); /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { rv = KERN_NO_SPACE; goto out; } rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, max, cow); out: vm_map_unlock(map); return (rv); } static int stack_guard_page = 1; SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, &stack_guard_page, 0, "Specifies the number of guard pages for a stack that grows"); static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, gap_bot, gap_top, top; vm_size_t init_ssize, sgp; int orient, rv; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. */ orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP); KASSERT(orient != 0, ("No stack grow direction")); KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP), ("bi-dir stack")); if (max_ssize == 0 || !vm_map_range_valid(map, addrbos, addrbos + max_ssize)) return (KERN_INVALID_ADDRESS); sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : (vm_size_t)stack_guard_page * PAGE_SIZE; if (sgp >= max_ssize) return (KERN_INVALID_ARGUMENT); init_ssize = growsize; if (max_ssize < init_ssize + sgp) init_ssize = max_ssize - sgp; /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) return (KERN_NO_SPACE); /* * If we can't accommodate max_ssize in the current mapping, no go. */ if (prev_entry->next->start < addrbos + max_ssize) return (KERN_NO_SPACE); /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) { bot = addrbos + max_ssize - init_ssize; top = bot + init_ssize; gap_bot = addrbos; gap_top = bot; } else /* if (orient == MAP_STACK_GROWS_UP) */ { bot = addrbos; top = bot + init_ssize; gap_bot = top; gap_top = addrbos + max_ssize; } rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); if (rv != KERN_SUCCESS) return (rv); new_entry = prev_entry->next; KASSERT(new_entry->end == top || new_entry->start == bot, ("Bad entry start/end for new stack entry")); KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, ("new entry lacks MAP_ENTRY_GROWS_DOWN")); KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, ("new entry lacks MAP_ENTRY_GROWS_UP")); if (gap_bot == gap_top) return (KERN_SUCCESS); rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); if (rv == KERN_SUCCESS) { /* * Gap can never successfully handle a fault, so * read-ahead logic is never used for it. Re-use * next_read of the gap entry to store * stack_guard_page for vm_map_growstack(). */ if (orient == MAP_STACK_GROWS_DOWN) new_entry->prev->next_read = sgp; else new_entry->next->next_read = sgp; } else { (void)vm_map_delete(map, bot, top); } return (rv); } /* * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we * successfully grow the stack. */ static int vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry) { vm_map_entry_t stack_entry; struct proc *p; struct vmspace *vm; struct ucred *cred; vm_offset_t gap_end, gap_start, grow_start; size_t grow_amount, guard, max_grow; rlim_t lmemlim, stacklim, vmemlim; int rv, rv1; bool gap_deleted, grow_down, is_procstack; #ifdef notyet uint64_t limit; #endif #ifdef RACCT int error; #endif p = curproc; vm = p->p_vmspace; /* * Disallow stack growth when the access is performed by a * debugger or AIO daemon. The reason is that the wrong * resource limits are applied. */ if (p != initproc && (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)) return (KERN_FAILURE); MPASS(!map->system_map); lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); stacklim = lim_cur(curthread, RLIMIT_STACK); vmemlim = lim_cur(curthread, RLIMIT_VMEM); retry: /* If addr is not in a hole for a stack grow area, no need to grow. */ if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) return (KERN_FAILURE); if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0) return (KERN_SUCCESS); if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) { stack_entry = gap_entry->next; if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 || stack_entry->start != gap_entry->end) return (KERN_FAILURE); grow_amount = round_page(stack_entry->start - addr); grow_down = true; } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) { stack_entry = gap_entry->prev; if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 || stack_entry->end != gap_entry->start) return (KERN_FAILURE); grow_amount = round_page(addr + 1 - stack_entry->end); grow_down = false; } else { return (KERN_FAILURE); } guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 || (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 : gap_entry->next_read; max_grow = gap_entry->end - gap_entry->start; if (guard > max_grow) return (KERN_NO_SPACE); max_grow -= guard; if (grow_amount > max_grow) return (KERN_NO_SPACE); /* * If this is the main process stack, see if we're over the stack * limit. */ is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr && addr < (vm_offset_t)p->p_sysent->sv_usrstack; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) return (KERN_NO_SPACE); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (is_procstack && racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) { PROC_UNLOCK(p); return (KERN_NO_SPACE); } PROC_UNLOCK(p); } #endif grow_amount = roundup(grow_amount, sgrowsiz); if (grow_amount > max_grow) grow_amount = max_grow; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = trunc_page((vm_size_t)stacklim) - ctob(vm->vm_ssize); } #ifdef notyet PROC_LOCK(p); limit = racct_get_available(p, RACCT_STACK); PROC_UNLOCK(p); if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) grow_amount = limit - ctob(vm->vm_ssize); #endif if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) { if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { PROC_UNLOCK(p); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { rv = KERN_NO_SPACE; goto out; } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { PROC_UNLOCK(p); rv = KERN_NO_SPACE; goto out; } PROC_UNLOCK(p); } #endif if (vm_map_lock_upgrade(map)) { gap_entry = NULL; vm_map_lock_read(map); goto retry; } if (grow_down) { grow_start = gap_entry->end - grow_amount; if (gap_entry->start + grow_amount == gap_entry->end) { gap_start = gap_entry->start; gap_end = gap_entry->end; vm_map_entry_delete(map, gap_entry); gap_deleted = true; } else { MPASS(gap_entry->start < gap_entry->end - grow_amount); gap_entry->end -= grow_amount; vm_map_entry_resize_free(map, gap_entry); gap_deleted = false; } rv = vm_map_insert(map, NULL, 0, grow_start, grow_start + grow_amount, stack_entry->protection, stack_entry->max_protection, MAP_STACK_GROWS_DOWN); if (rv != KERN_SUCCESS) { if (gap_deleted) { rv1 = vm_map_insert(map, NULL, 0, gap_start, gap_end, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN); MPASS(rv1 == KERN_SUCCESS); } else { gap_entry->end += grow_amount; vm_map_entry_resize_free(map, gap_entry); } } } else { grow_start = stack_entry->end; cred = stack_entry->cred; if (cred == NULL && stack_entry->object.vm_object != NULL) cred = stack_entry->object.vm_object->cred; if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred)) rv = KERN_NO_SPACE; /* Grow the underlying object if applicable. */ else if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), (vm_size_t)grow_amount, cred != NULL)) { if (gap_entry->start + grow_amount == gap_entry->end) vm_map_entry_delete(map, gap_entry); else gap_entry->start += grow_amount; stack_entry->end += grow_amount; map->size += grow_amount; vm_map_entry_resize_free(map, stack_entry); rv = KERN_SUCCESS; } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) { rv = vm_map_wire_locked(map, grow_start, grow_start + grow_amount, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); } vm_map_lock_downgrade(map); out: #ifdef RACCT if (racct_enable && rv != KERN_SUCCESS) { PROC_LOCK(p); error = racct_set(p, RACCT_VMEM, map->size); KASSERT(error == 0, ("decreasing RACCT_VMEM failed")); if (!old_mlock) { error = racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed")); } error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize)); KASSERT(error == 0, ("decreasing RACCT_STACK failed")); PROC_UNLOCK(p); } #endif return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ int vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0, ("vmspace_exec recursed")); newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit); if (newvmspace == NULL) return (ENOMEM); newvmspace->vm_swrss = oldvmspace->vm_swrss; /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); curthread->td_pflags |= TDP_EXECVMSPC; return (0); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ int vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; vm_ooffset_t fork_charge; if (oldvmspace->vm_refcnt == 1) return (0); fork_charge = 0; newvmspace = vmspace_fork(oldvmspace, &fork_charge); if (newvmspace == NULL) return (ENOMEM); if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) { vmspace_free(newvmspace); return (ENOMEM); } PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) pmap_activate(curthread); vmspace_free(oldvmspace); return (0); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type; vm_object_t eobject; vm_size_t size; struct ucred *cred; RetryLookup: vm_map_lock_read(map); RetryLookupLocked: /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } entry = *out_entry; /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. */ prot = entry->protection; if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) { fault_typea &= ~VM_PROT_FAULT_LOOKUP; if (prot == VM_PROT_NONE && map != kernel_map && (entry->eflags & MAP_ENTRY_GUARD) != 0 && (entry->eflags & (MAP_ENTRY_STACK_GAP_DN | MAP_ENTRY_STACK_GAP_UP)) != 0 && vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS) goto RetryLookupLocked; } fault_type = fault_typea & VM_PROT_ALL; if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags & (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) != (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY), ("entry %p flags %x", entry, entry->eflags)); if ((fault_typea & VM_PROT_COPY) != 0 && (entry->max_protection & VM_PROT_WRITE) == 0 && (entry->eflags & MAP_ENTRY_COW) == 0) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; size = entry->end - entry->start; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if ((fault_type & VM_PROT_WRITE) != 0 || (fault_typea & VM_PROT_COPY) != 0) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; if (entry->cred == NULL) { /* * The debugger owner is charged for * the memory. */ cred = curthread->td_ucred; crhold(cred); if (!swap_reserve_by_cred(size, cred)) { crfree(cred); vm_map_unlock(map); return (KERN_RESOURCE_SHORTAGE); } entry->cred = cred; } vm_object_shadow(&entry->object.vm_object, &entry->offset, size); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; eobject = entry->object.vm_object; if (eobject->cred != NULL) { /* * The object was not shadowed. */ swap_release_by_cred(size, entry->cred); crfree(entry->cred); entry->cred = NULL; } else if (entry->cred != NULL) { VM_OBJECT_WLOCK(eobject); eobject->cred = entry->cred; eobject->charge = size; VM_OBJECT_WUNLOCK(eobject); entry->cred = NULL; } vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(size)); entry->offset = 0; if (entry->cred != NULL) { VM_OBJECT_WLOCK(entry->object.vm_object); entry->object.vm_object->cred = entry->cred; entry->object.vm_object->charge = size; VM_OBJECT_WUNLOCK(entry->object.vm_object); entry->cred = NULL; } vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_locked: * * Lookup the faulting address. A version of vm_map_lookup that returns * KERN_FAILURE instead of blocking on map lock or memory allocation. */ int vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; /* * Lookup the faulting address. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) return (KERN_INVALID_ADDRESS); entry = *out_entry; /* * Fail if the entry refers to a submap. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return (KERN_FAILURE); /* * Check whether this task is allowed to have this page. */ prot = entry->protection; fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) fault_type = entry->protection; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * Fail if the entry was copy-on-write for a write fault. */ if (fault_type & VM_PROT_WRITE) return (KERN_FAILURE); /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } /* * Fail if an object should be created. */ if (entry->object.vm_object == NULL && !map->system_map) return (KERN_FAILURE); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } vm_offset_t vm_map_max_KBI(const struct vm_map *map) { return (vm_map_max(map)); } vm_offset_t vm_map_min_KBI(const struct vm_map *map) { return (vm_map_min(map)); } pmap_t vm_map_pmap_KBI(vm_map_t map) { return (map->pmap); } bool vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end) { return (vm_map_range_valid(map, start, end)); } #include "opt_ddb.h" #ifdef DDB #include #include static void vm_map_print(vm_map_t map) { vm_map_entry_t entry; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n", (void *)entry, (void *)entry->start, (void *)entry->end, entry->eflags); { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char)entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); if ((entry->prev == &map->header) || (entry->prev->object.sub_map != entry->object.sub_map)) { db_indent += 2; vm_map_print((vm_map_t)entry->object.sub_map); db_indent -= 2; } } else { if (entry->cred != NULL) db_printf(", ruid %d", entry->cred->cr_ruid); db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->object.vm_object && entry->object.vm_object->cred) db_printf(", obj ruid %d charge %jx", entry->object.vm_object->cred->cr_ruid, (uintmax_t)entry->object.vm_object->charge); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); if ((entry->prev == &map->header) || (entry->prev->object.vm_object != entry->object.vm_object)) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, 0, 0, (char *)0); db_indent -= 2; } } } db_indent -= 2; } DB_SHOW_COMMAND(map, map) { if (!have_addr) { db_printf("usage: show map \n"); return; } vm_map_print((vm_map_t)addr); } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = db_lookup_proc(addr); } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((vm_map_t)&p->p_vmspace->vm_map); } #endif /* DDB */ Index: stable/12/sys/vm/vm_map.h =================================================================== --- stable/12/sys/vm/vm_map.h (revision 365006) +++ stable/12/sys/vm/vm_map.h (revision 365007) @@ -1,445 +1,450 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_map.h 8.9 (Berkeley) 5/17/95 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory map module definitions. */ #ifndef _VM_MAP_ #define _VM_MAP_ #include #include #include /* * Types defined: * * vm_map_t the high-level address map data structure. * vm_map_entry_t an entry in an address map. */ typedef u_char vm_flags_t; typedef u_int vm_eflags_t; /* * Objects which live in maps may be either VM objects, or * another map (called a "sharing map") which denotes read-write * sharing with other maps. */ union vm_map_object { struct vm_object *vm_object; /* object object */ struct vm_map *sub_map; /* belongs to another map */ }; /* * Address map entries consist of start and end addresses, * a VM object (or sharing map) and offset into that object, * and user-exported inheritance and protection information. * Also included is control information for virtual copy operations. */ struct vm_map_entry { struct vm_map_entry *prev; /* previous entry */ struct vm_map_entry *next; /* next entry */ struct vm_map_entry *left; /* left child in binary search tree */ struct vm_map_entry *right; /* right child in binary search tree */ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ vm_offset_t next_read; /* vaddr of the next sequential read */ vm_size_t pad_adj_free; /* pad */ vm_size_t max_free; /* max free space in subtree */ union vm_map_object object; /* object I point to */ vm_ooffset_t offset; /* offset into object */ vm_eflags_t eflags; /* map entry flags */ vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ vm_inherit_t inheritance; /* inheritance */ uint8_t read_ahead; /* pages in the read-ahead window */ int wired_count; /* can be paged if = 0 */ struct ucred *cred; /* tmp storage for creator ref */ struct thread *wiring_thread; }; #define MAP_ENTRY_NOSYNC 0x00000001 #define MAP_ENTRY_IS_SUB_MAP 0x00000002 #define MAP_ENTRY_COW 0x00000004 #define MAP_ENTRY_NEEDS_COPY 0x00000008 #define MAP_ENTRY_NOFAULT 0x00000010 #define MAP_ENTRY_USER_WIRED 0x00000020 #define MAP_ENTRY_BEHAV_NORMAL 0x00000000 /* default behavior */ #define MAP_ENTRY_BEHAV_SEQUENTIAL 0x00000040 /* expect sequential access */ #define MAP_ENTRY_BEHAV_RANDOM 0x00000080 /* expect random access */ #define MAP_ENTRY_BEHAV_RESERVED 0x000000c0 /* future use */ #define MAP_ENTRY_BEHAV_MASK 0x000000c0 #define MAP_ENTRY_IN_TRANSITION 0x00000100 /* entry being changed */ #define MAP_ENTRY_NEEDS_WAKEUP 0x00000200 /* waiters in transition */ #define MAP_ENTRY_NOCOREDUMP 0x00000400 /* don't include in a core */ #define MAP_ENTRY_VN_EXEC 0x00000800 /* text vnode mapping */ #define MAP_ENTRY_GROWS_DOWN 0x00001000 /* top-down stacks */ #define MAP_ENTRY_GROWS_UP 0x00002000 /* bottom-up stacks */ #define MAP_ENTRY_WIRE_SKIPPED 0x00004000 #define MAP_ENTRY_WRITECNT 0x00008000 /* tracked writeable mapping */ #define MAP_ENTRY_GUARD 0x00010000 #define MAP_ENTRY_STACK_GAP_DN 0x00020000 #define MAP_ENTRY_STACK_GAP_UP 0x00040000 #define MAP_ENTRY_HEADER 0x00080000 #ifdef _KERNEL static __inline u_char vm_map_entry_behavior(vm_map_entry_t entry) { return (entry->eflags & MAP_ENTRY_BEHAV_MASK); } static __inline int vm_map_entry_user_wired_count(vm_map_entry_t entry) { if (entry->eflags & MAP_ENTRY_USER_WIRED) return (1); return (0); } static __inline int vm_map_entry_system_wired_count(vm_map_entry_t entry) { return (entry->wired_count - vm_map_entry_user_wired_count(entry)); } #endif /* _KERNEL */ /* * A map is a set of map entries. These map entries are * organized both as a binary search tree and as a doubly-linked * list. Both structures are ordered based upon the start and * end addresses contained within each map entry. * * Sleator and Tarjan's top-down splay algorithm is employed to * control height imbalance in the binary search tree. * * The map's min offset value is stored in map->header.end, and * its max offset value is stored in map->header.start. These * values act as sentinels for any forward or backward address * scan of the list. The map header has a special value for the * eflags field, MAP_ENTRY_HEADER, that is set initially, is * never changed, and prevents an eflags match of the header * with any other map entry. * * List of locks * (c) const until freed */ struct vm_map { struct vm_map_entry header; /* List of entries */ struct sx lock; /* Lock for map data */ struct mtx system_mtx; int nentries; /* Number of entries */ vm_size_t size; /* virtual size */ u_int timestamp; /* Version number */ u_char needs_wakeup; u_char system_map; /* (c) Am I a system map? */ vm_flags_t flags; /* flags for this vm_map */ vm_map_entry_t root; /* Root of a binary search tree */ pmap_t pmap; /* (c) Physical map */ vm_offset_t anon_loc; int busy; }; /* * vm_flags_t values */ #define MAP_WIREFUTURE 0x01 /* wire all future pages */ #define MAP_BUSY_WAKEUP 0x02 #define MAP_IS_SUB_MAP 0x04 /* has parent */ #define MAP_ASLR 0x08 /* enabled ASLR */ #define MAP_ASLR_IGNSTART 0x10 #ifdef _KERNEL #if defined(KLD_MODULE) && !defined(KLD_TIED) #define vm_map_max(map) vm_map_max_KBI((map)) #define vm_map_min(map) vm_map_min_KBI((map)) #define vm_map_pmap(map) vm_map_pmap_KBI((map)) #define vm_map_range_valid(map, start, end) \ vm_map_range_valid_KBI((map), (start), (end)) #else static __inline vm_offset_t vm_map_max(const struct vm_map *map) { return (map->header.start); } static __inline vm_offset_t vm_map_min(const struct vm_map *map) { return (map->header.end); } static __inline pmap_t vm_map_pmap(vm_map_t map) { return (map->pmap); } static __inline void vm_map_modflags(vm_map_t map, vm_flags_t set, vm_flags_t clear) { map->flags = (map->flags | set) & ~clear; } static inline bool vm_map_range_valid(vm_map_t map, vm_offset_t start, vm_offset_t end) { if (end < start) return (false); if (start < vm_map_min(map) || end > vm_map_max(map)) return (false); return (true); } #endif /* KLD_MODULE */ #endif /* _KERNEL */ /* * Shareable process virtual address space. * * List of locks * (c) const until freed */ struct vmspace { struct vm_map vm_map; /* VM address map */ struct shmmap_state *vm_shm; /* SYS5 shared memory private data XXX */ segsz_t vm_swrss; /* resident set size before last swap */ segsz_t vm_tsize; /* text size (pages) XXX */ segsz_t vm_dsize; /* data size (pages) XXX */ segsz_t vm_ssize; /* stack size (pages) */ caddr_t vm_taddr; /* (c) user virtual address of text */ caddr_t vm_daddr; /* (c) user virtual address of data */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ volatile int vm_refcnt; /* number of references */ /* * Keep the PMAP last, so that CPU-specific variations of that * structure on a single architecture don't result in offset * variations of the machine-independent fields in the vmspace. */ struct pmap vm_pmap; /* private physical map */ }; #ifdef _KERNEL static __inline pmap_t vmspace_pmap(struct vmspace *vmspace) { return &vmspace->vm_pmap; } #endif /* _KERNEL */ #ifdef _KERNEL /* * Macros: vm_map_lock, etc. * Function: * Perform locking on the data portion of a map. Note that * these macros mimic procedure calls returning void. The * semicolon is supplied by the user of these macros, not * by the macros themselves. The macros can safely be used * as unbraced elements in a higher level statement. */ void _vm_map_lock(vm_map_t map, const char *file, int line); void _vm_map_unlock(vm_map_t map, const char *file, int line); int _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line); void _vm_map_lock_read(vm_map_t map, const char *file, int line); void _vm_map_unlock_read(vm_map_t map, const char *file, int line); int _vm_map_trylock(vm_map_t map, const char *file, int line); int _vm_map_trylock_read(vm_map_t map, const char *file, int line); int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line); void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line); int vm_map_locked(vm_map_t map); void vm_map_wakeup(vm_map_t map); void vm_map_busy(vm_map_t map); void vm_map_unbusy(vm_map_t map); void vm_map_wait_busy(vm_map_t map); vm_offset_t vm_map_max_KBI(const struct vm_map *map); vm_offset_t vm_map_min_KBI(const struct vm_map *map); pmap_t vm_map_pmap_KBI(vm_map_t map); bool vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end); #define vm_map_lock(map) _vm_map_lock(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock(map) _vm_map_unlock(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock_and_wait(map, timo) \ _vm_map_unlock_and_wait(map, timo, LOCK_FILE, LOCK_LINE) #define vm_map_lock_read(map) _vm_map_lock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock_read(map) _vm_map_unlock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_trylock(map) _vm_map_trylock(map, LOCK_FILE, LOCK_LINE) #define vm_map_trylock_read(map) \ _vm_map_trylock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_lock_upgrade(map) \ _vm_map_lock_upgrade(map, LOCK_FILE, LOCK_LINE) #define vm_map_lock_downgrade(map) \ _vm_map_lock_downgrade(map, LOCK_FILE, LOCK_LINE) long vmspace_resident_count(struct vmspace *vmspace); #endif /* _KERNEL */ + +/* XXX: number of kernel maps to statically allocate */ +#define MAX_KMAP 10 + /* * Copy-on-write flags for vm_map operations */ #define MAP_INHERIT_SHARE 0x00000001 #define MAP_COPY_ON_WRITE 0x00000002 #define MAP_NOFAULT 0x00000004 #define MAP_PREFAULT 0x00000008 #define MAP_PREFAULT_PARTIAL 0x00000010 #define MAP_DISABLE_SYNCER 0x00000020 #define MAP_CHECK_EXCL 0x00000040 #define MAP_CREATE_GUARD 0x00000080 #define MAP_DISABLE_COREDUMP 0x00000100 #define MAP_PREFAULT_MADVISE 0x00000200 /* from (user) madvise request */ #define MAP_WRITECOUNT 0x00000400 #define MAP_REMAP 0x00000800 #define MAP_STACK_GROWS_DOWN 0x00001000 #define MAP_STACK_GROWS_UP 0x00002000 #define MAP_ACC_CHARGED 0x00004000 #define MAP_ACC_NO_CHARGE 0x00008000 #define MAP_CREATE_STACK_GAP_UP 0x00010000 #define MAP_CREATE_STACK_GAP_DN 0x00020000 #define MAP_VN_EXEC 0x00040000 /* * vm_fault option flags */ #define VM_FAULT_NORMAL 0 /* Nothing special */ #define VM_FAULT_WIRE 1 /* Wire the mapped page */ #define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */ /* * Initially, mappings are slightly sequential. The maximum window size must * account for the map entry's "read_ahead" field being defined as an uint8_t. */ #define VM_FAULT_READ_AHEAD_MIN 7 #define VM_FAULT_READ_AHEAD_INIT 15 #define VM_FAULT_READ_AHEAD_MAX min(atop(MAXPHYS) - 1, UINT8_MAX) /* * The following "find_space" options are supported by vm_map_find(). * * For VMFS_ALIGNED_SPACE, the desired alignment is specified to * the macro argument as log base 2 of the desired alignment. */ #define VMFS_NO_SPACE 0 /* don't find; use the given range */ #define VMFS_ANY_SPACE 1 /* find a range with any alignment */ #define VMFS_OPTIMAL_SPACE 2 /* find a range with optimal alignment*/ #define VMFS_SUPER_SPACE 3 /* find a superpage-aligned range */ #define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find a range with fixed alignment */ /* * vm_map_wire and vm_map_unwire option flags */ #define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ #define VM_MAP_WIRE_USER 1 /* wiring in a user map */ #define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ #define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ #define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ #ifdef _KERNEL boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t); +vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t); int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t); int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); vm_offset_t vm_map_findspace(vm_map_t, vm_offset_t, vm_size_t); int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t); void vm_map_init(vm_map_t, pmap_t, vm_offset_t, vm_offset_t); int vm_map_insert (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int); int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *); int vm_map_lookup_locked(vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *); void vm_map_lookup_done (vm_map_t, vm_map_entry_t); boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *); int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t); int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t); void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry); void vm_map_startup (void); int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t); int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int); int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); long vmspace_swap_count(struct vmspace *vmspace); void vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add); #endif /* _KERNEL */ #endif /* _VM_MAP_ */ Index: stable/12/sys/vm/vm_page.c =================================================================== --- stable/12/sys/vm/vm_page.c (revision 365006) +++ stable/12/sys/vm/vm_page.c (revision 365007) @@ -1,4586 +1,4589 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int uma_startup_count(int); extern void uma_startup(void *, int); extern int vmem_startup_count(void); struct vm_domain vm_dom[MAXMEMDOM]; DPCPU_DEFINE_STATIC(struct vm_batchqueue, pqbatch[MAXMEMDOM][PQ_COUNT]); struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT]; struct mtx_padalign __exclusive_cache_line vm_domainset_lock; /* The following fields are protected by the domainset lock. */ domainset_t __exclusive_cache_line vm_min_domains; domainset_t __exclusive_cache_line vm_severe_domains; static int vm_min_waiters; static int vm_severe_waiters; static int vm_pageproc_waiters; /* * bogus page -- for I/O to/from partially complete buffers, * or for paging into sparsely invalid regions. */ vm_page_t bogus_page; vm_page_t vm_page_array; long vm_page_array_size; long first_page; static int boot_pages; SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static TAILQ_HEAD(, vm_page) blacklist_head; static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages"); static uma_zone_t fakepg_zone; static void vm_page_alloc_check(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); static void vm_page_dequeue_complete(vm_page_t m); static void vm_page_enqueue(vm_page_t m, uint8_t queue); static void vm_page_init(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high); static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req); static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags); static void vm_page_zone_release(void *arg, void **store, int cnt); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init, NULL); static void vm_page_init(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * The cache page zone is initialized later since we need to be able to allocate * pages before UMA is fully initialized. */ static void vm_page_init_cache_zones(void *dummy __unused) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int domain, pool; for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); /* * Don't allow the page caches to take up more than .1875% of * memory. A UMA bucket contains at most 256 free pages, and we * have two buckets per CPU per free pool. */ if (vmd->vmd_page_count / 600 < 2 * 256 * mp_ncpus * VM_NFREEPOOL) continue; for (pool = 0; pool < VM_NFREEPOOL; pool++) { pgcache = &vmd->vmd_pgcache[pool]; pgcache->domain = domain; pgcache->pool = pool; pgcache->zone = uma_zcache_create("vm pgcache", sizeof(struct vm_page), NULL, NULL, NULL, NULL, vm_page_zone_import, vm_page_zone_release, pgcache, UMA_ZONE_NOBUCKETCACHE | UMA_ZONE_MAXBUCKET | UMA_ZONE_VM); } } } SYSINIT(vm_page2, SI_SUB_VM_CONF, SI_ORDER_ANY, vm_page_init_cache_zones, NULL); /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_next: * * Find the next entry in the provided string of blacklist * addresses. Entries are separated by space, comma, or newline. * If an invalid integer is encountered then the rest of the * string is skipped. Updates the list pointer to the next * character, or NULL if the string is exhausted or invalid. */ static vm_paddr_t vm_page_blacklist_next(char **list, char *end) { vm_paddr_t bad; char *cp, *pos; if (list == NULL || *list == NULL) return (0); if (**list =='\0') { *list = NULL; return (0); } /* * If there's no end pointer then the buffer is coming from * the kenv and we know it's null-terminated. */ if (end == NULL) end = *list + strlen(*list); /* Ensure that strtoq() won't walk off the end */ if (*end != '\0') { if (*end == '\n' || *end == ' ' || *end == ',') *end = '\0'; else { printf("Blacklist not terminated, skipping\n"); *list = NULL; return (0); } } for (pos = *list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') { if (bad == 0) { if (++cp < end) continue; else break; } } else break; if (*cp == '\0' || ++cp >= end) *list = NULL; else *list = cp; return (trunc_page(bad)); } printf("Garbage in RAM blacklist, skipping\n"); *list = NULL; return (0); } bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose) { struct vm_domain *vmd; vm_page_t m; int ret; m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) return (true); /* page does not exist, no failure */ vmd = vm_pagequeue_domain(m); vm_domain_free_lock(vmd); ret = vm_phys_unfree_page(m); vm_domain_free_unlock(vmd); if (ret != 0) { vm_domain_freecnt_inc(vmd, -1); TAILQ_INSERT_TAIL(&blacklist_head, m, listq); if (verbose) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); } return (ret); } /* * vm_page_blacklist_check: * * Iterate through the provided string of blacklist addresses, pulling * each entry out of the physical allocator free list and putting it * onto a list for reporting via the vm.page_blacklist sysctl. */ static void vm_page_blacklist_check(char *list, char *end) { vm_paddr_t pa; char *next; next = list; while (next != NULL) { if ((pa = vm_page_blacklist_next(&next, end)) == 0) continue; vm_page_blacklist_add(pa, bootverbose); } } /* * vm_page_blacklist_load: * * Search for a special module named "ram_blacklist". It'll be a * plain text file provided by the user via the loader directive * of the same name. */ static void vm_page_blacklist_load(char **list, char **end) { void *mod; u_char *ptr; u_int len; mod = NULL; ptr = NULL; mod = preload_search_by_type("ram_blacklist"); if (mod != NULL) { ptr = preload_fetch_addr(mod); len = preload_fetch_size(mod); } *list = ptr; if (ptr != NULL) *end = ptr + len; else *end = NULL; return; } static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS) { vm_page_t m; struct sbuf sbuf; int error, first; first = 1; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); TAILQ_FOREACH(m, &blacklist_head, listq) { sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",", (uintmax_t)m->phys_addr); first = 0; } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Initialize a dummy page for use in scans of the specified paging queue. * In principle, this function only needs to set the flag PG_MARKER. * Nonetheless, it write busies and initializes the hold count to one as * safety precautions. */ static void vm_page_init_marker(vm_page_t marker, int queue, uint8_t aflags) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->aflags = aflags; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; marker->queue = queue; marker->hold_count = 1; } static void vm_page_domain_init(int domain) { struct vm_domain *vmd; struct vm_pagequeue *pq; int i; vmd = VM_DOMAIN(domain); bzero(vmd, sizeof(*vmd)); *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) = "vm laundry pagequeue"; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_name) = "vm unswappable pagequeue"; vmd->vmd_domain = domain; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); pq->pq_pdpages = 0; vm_page_init_marker(&vmd->vmd_markers[i], i, 0); } mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF); mtx_init(&vmd->vmd_pageout_mtx, "vm pageout lock", NULL, MTX_DEF); snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain); /* * inacthead is used to provide FIFO ordering for LRU-bypassing * insertions. */ vm_page_init_marker(&vmd->vmd_inacthead, PQ_INACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_INACTIVE].pq_pl, &vmd->vmd_inacthead, plinks.q); /* * The clock pages are used to implement active queue scanning without * requeues. Scans start at clock[0], which is advanced after the scan * ends. When the two clock hands meet, they are reset and scanning * resumes from the head of the queue. */ vm_page_init_marker(&vmd->vmd_clock[0], PQ_ACTIVE, PGA_ENQUEUED); vm_page_init_marker(&vmd->vmd_clock[1], PQ_ACTIVE, PGA_ENQUEUED); TAILQ_INSERT_HEAD(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&vmd->vmd_pagequeues[PQ_ACTIVE].pq_pl, &vmd->vmd_clock[1], plinks.q); } /* * Initialize a physical page in preparation for adding it to the free * lists. */ static void vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind) { m->object = NULL; m->wire_count = 0; m->busy_lock = VPB_UNBUSIED; m->hold_count = 0; m->flags = m->aflags = 0; m->phys_addr = pa; m->queue = PQ_NONE; m->psind = 0; m->segind = segind; m->order = VM_NFREEORDER; m->pool = VM_FREEPOOL_DEFAULT; m->valid = m->dirty = 0; pmap_page_init(m); } /* * vm_page_startup: * * Initializes the resident memory module. Allocates physical memory for * bootstrapping UMA and some data structures that are used to manage * physical pages. Initializes these structures, and populates the free * page queues. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { struct vm_phys_seg *seg; vm_page_t m; char *list, *listend; vm_offset_t mapped; vm_paddr_t end, high_avail, low_avail, new_end, page_range, size; vm_paddr_t biggestsize, last_pa, pa; u_long pagecount; int biggestone, i, segind; #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) long ii; #endif biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } } end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_domainset_lock, "vm domainset lock", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(i); /* * Allocate memory for use when boot strapping the kernel memory * allocator. Tell UMA how many zones we are going to create * before going fully functional. UMA will add its zones. * * VM startup zones: vmem, vmem_btag, VM OBJECT, RADIX NODE, MAP, * KMAP ENTRY, MAP ENTRY, VMSPACE. */ boot_pages = uma_startup_count(8); #ifndef UMA_MD_SMALL_ALLOC /* vmem_startup() calls uma_prealloc(). */ boot_pages += vmem_startup_count(); + /* vm_map_startup() calls uma_prealloc(). */ + boot_pages += howmany(MAX_KMAP, + UMA_SLAB_SPACE / sizeof(struct vm_map)); /* * Before going fully functional kmem_init() does allocation * from "KMAP ENTRY" and vmem_create() does allocation from "vmem". */ boot_pages += 2; #endif /* * CTFLAG_RDTUN doesn't work during the early boot process, so we must * manually fetch the value. */ TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages); new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #ifdef WITNESS end = new_end; new_end = end - round_page(witness_startup_count()); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); witness_startup((void *)mapped); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ defined(__i386__) || defined(__mips__) || defined(__riscv) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #else (void)last_pa; #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) /* * Include the UMA bootstrap pages and vm_page_dump in a crash dump. * When pmap_map() uses the direct map, they are not automatically * included. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use, taking into account the overhead of a page structure per page. * In other words, solve * "available physical memory" - round_page(page_range * * sizeof(struct vm_page)) = page_range * PAGE_SIZE * for page_range. */ low_avail = phys_avail[0]; high_avail = phys_avail[1]; for (i = 0; i < vm_phys_nsegs; i++) { if (vm_phys_segs[i].start < low_avail) low_avail = vm_phys_segs[i].start; if (vm_phys_segs[i].end > high_avail) high_avail = vm_phys_segs[i].end; } /* Skip the first chunk. It is already accounted for. */ for (i = 2; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i] < low_avail) low_avail = phys_avail[i]; if (phys_avail[i + 1] > high_avail) high_avail = phys_avail[i + 1]; } first_page = low_avail / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE size = 0; for (i = 0; i < vm_phys_nsegs; i++) size += vm_phys_segs[i].end - vm_phys_segs[i].start; for (i = 0; phys_avail[i + 1] != 0; i += 2) size += phys_avail[i + 1] - phys_avail[i]; #elif defined(VM_PHYSSEG_DENSE) size = high_avail - low_avail; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif #ifdef VM_PHYSSEG_DENSE /* * In the VM_PHYSSEG_DENSE case, the number of pages can account for * the overhead of a page structure per page only if vm_page_array is * allocated from the last physical memory chunk. Otherwise, we must * allocate page structures representing the physical memory * underlying vm_page_array, even though they will not be used. */ if (new_end != high_avail) page_range = size / PAGE_SIZE; else #endif { page_range = size / (PAGE_SIZE + sizeof(struct vm_page)); /* * If the partial bytes remaining are large enough for * a page (PAGE_SIZE) without a corresponding * 'struct vm_page', then new_end will contain an * extra page after subtracting the length of the VM * page array. Compensate by subtracting an extra * page from new_end. */ if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) { if (new_end == high_avail) high_avail -= PAGE_SIZE; new_end -= PAGE_SIZE; } } end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. * However, because this page is allocated from KVM, out-of-bounds * accesses using the direct map will not be trapped. */ vaddr += PAGE_SIZE; /* * Allocate physical memory for the page structures, and map it. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t)mapped; vm_page_array_size = page_range; #if VM_NRESERVLEVEL > 0 /* * Allocate physical memory for the reservation management system's * data structures, and map it. */ if (high_avail == end) high_avail = new_end; new_end = vm_reserv_startup(&vaddr, new_end, high_avail); #endif #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__) || \ defined(__riscv) /* * Include vm_page_array and vm_reserv_array in a crash dump. */ for (pa = new_end; pa < end; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Add physical memory segments corresponding to the available * physical pages. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]); /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Initialize the page structures and add every available page to the * physical memory allocator's free lists. */ #if defined(__i386__) && defined(VM_PHYSSEG_DENSE) for (ii = 0; ii < vm_page_array_size; ii++) { m = &vm_page_array[ii]; vm_page_init_page(m, (first_page + ii) << PAGE_SHIFT, 0); m->flags = PG_FICTITIOUS; } #endif vm_cnt.v_page_count = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; for (m = seg->first_page, pa = seg->start; pa < seg->end; m++, pa += PAGE_SIZE) vm_page_init_page(m, pa, segind); /* * Add the segment to the free lists only if it is covered by * one of the ranges in phys_avail. Because we've added the * ranges to the vm_phys_segs array, we can assume that each * segment is either entirely contained in one of the ranges, * or doesn't overlap any of them. */ for (i = 0; phys_avail[i + 1] != 0; i += 2) { struct vm_domain *vmd; if (seg->start < phys_avail[i] || seg->end > phys_avail[i + 1]) continue; m = seg->first_page; pagecount = (u_long)atop(seg->end - seg->start); vmd = VM_DOMAIN(seg->domain); vm_domain_free_lock(vmd); vm_phys_free_contig(m, pagecount); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, pagecount); vm_cnt.v_page_count += (u_int)pagecount; vmd = VM_DOMAIN(seg->domain); vmd->vmd_page_count += (u_int)pagecount; vmd->vmd_segs |= 1UL << m->segind; break; } } /* * Remove blacklisted pages from the physical memory allocator. */ TAILQ_INIT(&blacklist_head); vm_page_blacklist_load(&list, &listend); vm_page_blacklist_check(list, listend); list = kern_getenv("vm.blacklist"); vm_page_blacklist_check(list, NULL); freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; bool locked; vm_page_assert_xbusied(m); locked = mtx_owned(vm_page_lockptr(m)); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; if (x != 0 && !locked) vm_page_lock(m); if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) break; if (x != 0 && !locked) vm_page_unlock(m); } if (x != 0) { wakeup(m); if (!locked) vm_page_unlock(m); } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_NOTOWNED); vm_page_assert_sbusied(m); for (;;) { x = m->busy_lock; if (VPB_SHARERS(x) > 1) { if (atomic_cmpset_int(&m->busy_lock, x, x - VPB_ONE_SHARER)) break; continue; } if ((x & VPB_BIT_WAITERS) == 0) { KASSERT(x == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (atomic_cmpset_int(&m->busy_lock, VPB_SHARERS_WORD(1), VPB_UNBUSIED)) break; continue; } KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), ("vm_page_sunbusy: invalid lock state for waiters")); vm_page_lock(m); if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { vm_page_unlock(m); continue; } wakeup(m); vm_page_unlock(m); break; } } /* * vm_page_busy_sleep: * * Sleep and release the page lock, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * The given page must be locked. * * If nonshared is true, sleep only if the page is xbusy. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { u_int x; vm_page_assert_locked(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { vm_page_unlock(m); return; } msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; for (;;) { x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) return (1); } } static void vm_page_xunbusy_locked(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_assert_locked(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); /* There is a waiter, do wakeup() instead of vm_page_flash(). */ wakeup(m); } void vm_page_xunbusy_maybelocked(vm_page_t m) { bool lockacq; vm_page_assert_xbusied(m); /* * Fast path for unbusy. If it succeeds, we know that there * are no waiters, so we do not need a wakeup. */ if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED)) return; lockacq = !mtx_owned(vm_page_lockptr(m)); if (lockacq) vm_page_lock(m); vm_page_xunbusy_locked(m); if (lockacq) vm_page_unlock(m); } /* * vm_page_xunbusy_hard: * * Called after the first try the exclusive unbusy of a page failed. * It is assumed that the waiters bit is on. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_lock(m); vm_page_xunbusy_locked(m); vm_page_unlock(m); } /* * vm_page_flash: * * Wakeup anyone waiting for the page. * The ownership bits do not change. * * The given page must be locked. */ void vm_page_flash(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_OWNED); for (;;) { x = m->busy_lock; if ((x & VPB_BIT_WAITERS) == 0) return; if (atomic_cmpset_int(&m->busy_lock, x, x & (~VPB_BIT_WAITERS))) break; } wakeup(m); } /* * Avoid releasing and reacquiring the same page lock. */ void vm_page_change_lock(vm_page_t m, struct mtx **mtx) { struct mtx *mtx1; mtx1 = vm_page_lockptr(m); if (*mtx == mtx1) return; if (*mtx != NULL) mtx_unlock(*mtx); *mtx = mtx1; mtx_lock(mtx1); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); --mem->hold_count; if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) vm_page_free_toq(mem); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { struct mtx *mtx; mtx = NULL; for (; count != 0; count--) { vm_page_change_lock(*ma, &mtx); vm_page_unhold(*ma); ma++; } if (mtx != NULL) mtx_unlock(mtx); } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; m->wire_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from a getpages request that * was optionally read ahead or behind. */ void vm_page_readahead_finish(vm_page_t m) { /* We shouldn't put invalid pages on queues. */ KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m)); /* * Since the page is not the actually needed one, whether it should * be activated or deactivated is not obvious. Empirical results * have shown that deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* Refer to this operation by its public name. */ KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = NULL; m->pindex = 0; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * * Removes the specified page from its containing object, but does not * invalidate any backing storage. Return true if the page may be safely * freed and false otherwise. * * The object must be locked. The page must be locked if it is managed. */ bool vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t mrem; object = m->object; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) vm_page_xunbusy_maybelocked(m); mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); m->object = NULL; return (!vm_page_wired(m)); } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_LOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL) { MPASS(next->object == m->object); if (next->pindex != m->pindex + 1) next = NULL; } return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_LOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) { MPASS(prev->object == m->object); if (prev->pindex != m->pindex - 1) prev = NULL; } return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(mnew->object == NULL, ("vm_page_replace: page %p already in object", mnew)); KASSERT(mnew->queue == PQ_NONE, ("vm_page_replace: new page %p is on a paging queue", mnew)); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: old page %p is on a paging queue", mold)); /* Keep the resident page list in sorted order. */ TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq); TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy_maybelocked(mold); /* * The object's resident_page_count does not change because we have * swapped one page for another, but OBJ_MIGHTBEDIRTY. */ if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); (void)vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { return (vm_page_alloc_after(object, pindex, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } vm_page_t vm_page_alloc_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req) { return (vm_page_alloc_domain_after(object, pindex, domain, req, object != NULL ? vm_radix_lookup_le(&object->rtree, pindex) : NULL)); } /* * Allocate a page in the specified object with the given page index. To * optimize insertion of the page into the object, the caller must also specifiy * the resident page in the object with largest index smaller than the given * page index, or NULL if no such page exists. */ vm_page_t vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req, vm_page_t mpred) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_domain_after(object, pindex, domain, req, mpred); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } /* * Returns true if the number of free pages exceeds the minimum * for the request class and false otherwise. */ static int _vm_domain_allocate(struct vm_domain *vmd, int req_class, int npages) { u_int limit, old, new; if (req_class == VM_ALLOC_INTERRUPT) limit = 0; else if (req_class == VM_ALLOC_SYSTEM) limit = vmd->vmd_interrupt_free_min; else limit = vmd->vmd_free_reserved; /* * Attempt to reserve the pages. Fail if we're below the limit. */ limit += npages; old = vmd->vmd_free_count; do { if (old < limit) return (0); new = old - npages; } while (atomic_fcmpset_int(&vmd->vmd_free_count, &old, new) == 0); /* Wake the page daemon if we've crossed the threshold. */ if (vm_paging_needed(vmd, new) && !vm_paging_needed(vmd, old)) pagedaemon_wakeup(vmd->vmd_domain); /* Only update bitsets on transitions. */ if ((old >= vmd->vmd_free_min && new < vmd->vmd_free_min) || (old >= vmd->vmd_free_severe && new < vmd->vmd_free_severe)) vm_domain_set(vmd); return (1); } int vm_domain_allocate(struct vm_domain *vmd, int req, int npages) { int req_class; /* * The page daemon is allowed to dig deeper into the free page list. */ req_class = req & VM_ALLOC_CLASS_MASK; if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; return (_vm_domain_allocate(vmd, req_class, npages)); } vm_page_t vm_page_alloc_domain_after(vm_object_t object, vm_pindex_t pindex, int domain, int req, vm_page_t mpred) { struct vm_domain *vmd; vm_page_t m; int flags, pool; KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); KASSERT(mpred == NULL || mpred->pindex < pindex, ("mpred %p doesn't precede pindex 0x%jx", mpred, (uintmax_t)pindex)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); flags = 0; m = NULL; pool = object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the page from a reservation? */ if (vm_object_reserv(object) && ((m = vm_reserv_extend(req, object, pindex, domain, mpred)) != NULL || (m = vm_reserv_alloc_page(req, object, pindex, domain, mpred)) != NULL)) { domain = vm_phys_domain(m); vmd = VM_DOMAIN(domain); goto found; } #endif vmd = VM_DOMAIN(domain); if (vmd->vmd_pgcache[pool].zone != NULL) { m = uma_zalloc(vmd->vmd_pgcache[pool].zone, M_NOWAIT); if (m != NULL) { flags |= PG_PCPU_CACHE; goto found; } } if (vm_domain_allocate(vmd, req, 1)) { /* * If not, allocate it from the free page queues. */ vm_domain_free_lock(vmd); m = vm_phys_alloc_pages(domain, pool, 0); vm_domain_free_unlock(vmd); if (m == NULL) { vm_domain_freecnt_inc(vmd, 1); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_inactive(domain)) goto again; #endif } } if (m == NULL) { /* * Not allocatable, give up. */ if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } /* * At this point we had better have found a good page. */ found: vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ if ((req & VM_ALLOC_ZERO) != 0) flags |= (m->flags & PG_ZERO); if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->aflags = 0; m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ vm_wire_add(1); m->wire_count = 1; } m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if (req & VM_ALLOC_WIRED) { vm_wire_sub(1); m->wire_count = 0; } KASSERT(m->object == NULL, ("page %p has object", m)); m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; return (m); } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The specified object may not contain fictitious pages. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, object, pindex, &domain, &req); do { m = vm_page_alloc_contig_domain(object, pindex, domain, req, npages, low, high, alignment, boundary, memattr); if (m != NULL) break; } while (vm_domainset_iter_page(&di, object, &domain) == 0); return (m); } vm_page_t vm_page_alloc_contig_domain(vm_object_t object, vm_pindex_t pindex, int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vm_domain *vmd; vm_page_t m, m_ret, mpred; u_int busy_lock, flags, oflags; mpred = NULL; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object, req)); KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0, ("Can't sleep and retry object insertion.")); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_FICTITIOUS) == 0, ("vm_page_alloc_contig: object %p has fictitious pages", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc_contig: pindex already allocated")); } /* * Can we allocate the pages without the number of free pages falling * below the lower bound for the allocation class? */ m_ret = NULL; again: #if VM_NRESERVLEVEL > 0 /* * Can we allocate the pages from a reservation? */ if (vm_object_reserv(object) && ((m_ret = vm_reserv_extend_contig(req, object, pindex, domain, npages, low, high, alignment, boundary, mpred)) != NULL || (m_ret = vm_reserv_alloc_contig(req, object, pindex, domain, npages, low, high, alignment, boundary, mpred)) != NULL)) { domain = vm_phys_domain(m_ret); vmd = VM_DOMAIN(domain); goto found; } #endif vmd = VM_DOMAIN(domain); if (vm_domain_allocate(vmd, req, npages)) { /* * allocate them from the free page queues. */ vm_domain_free_lock(vmd); m_ret = vm_phys_alloc_contig(domain, npages, low, high, alignment, boundary); vm_domain_free_unlock(vmd); if (m_ret == NULL) { vm_domain_freecnt_inc(vmd, npages); #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(domain, npages, low, high, alignment, boundary)) goto again; #endif } } if (m_ret == NULL) { if (vm_domain_alloc_fail(vmd, object, req)) goto again; return (NULL); } #if VM_NRESERVLEVEL > 0 found: #endif for (m = m_ret; m < &m_ret[npages]; m++) { vm_page_dequeue(m); vm_page_alloc_check(m); } /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ? VPO_UNMANAGED : 0; busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) busy_lock = VPB_SHARERS_WORD(1); if ((req & VM_ALLOC_WIRED) != 0) vm_wire_add(npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = busy_lock; if ((req & VM_ALLOC_WIRED) != 0) m->wire_count = 1; m->act_count = 0; m->oflags = oflags; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { if ((req & VM_ALLOC_WIRED) != 0) vm_wire_sub(npages); KASSERT(m->object == NULL, ("page %p has object", m)); mpred = m; for (m = m_ret; m < &m_ret[npages]; m++) { if (m <= mpred && (req & VM_ALLOC_WIRED) != 0) m->wire_count = 0; m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_UNBUSIED; /* Don't change PG_ZERO. */ vm_page_free_toq(m); } if (req & VM_ALLOC_WAITFAIL) { VM_OBJECT_WUNLOCK(object); vm_radix_wait(); VM_OBJECT_WLOCK(object); } return (NULL); } mpred = m; } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } return (m_ret); } /* * Check a page that has been freshly dequeued from a freelist. */ static void vm_page_alloc_check(vm_page_t m) { KASSERT(m->object == NULL, ("page %p has object", m)); KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, ("page %p has unexpected queue %d, flags %#x", m, m->queue, (m->aflags & PGA_QUEUE_STATE_MASK))); KASSERT(!vm_page_held(m), ("page %p is held", m)); KASSERT(!vm_page_busied(m), ("page %p is busy", m)); KASSERT(m->dirty == 0, ("page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page */ vm_page_t vm_page_alloc_freelist(int freelist, int req) { struct vm_domainset_iter di; vm_page_t m; int domain; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { m = vm_page_alloc_freelist_domain(domain, freelist, req); if (m != NULL) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (m); } vm_page_t vm_page_alloc_freelist_domain(int domain, int freelist, int req) { struct vm_domain *vmd; vm_page_t m; u_int flags; m = NULL; vmd = VM_DOMAIN(domain); again: if (vm_domain_allocate(vmd, req, 1)) { vm_domain_free_lock(vmd); m = vm_phys_alloc_freelist_pages(domain, freelist, VM_FREEPOOL_DIRECT, 0); vm_domain_free_unlock(vmd); if (m == NULL) vm_domain_freecnt_inc(vmd, 1); } if (m == NULL) { if (vm_domain_alloc_fail(vmd, NULL, req)) goto again; return (NULL); } vm_page_dequeue(m); vm_page_alloc_check(m); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; if ((req & VM_ALLOC_WIRED) != 0) { /* * The page lock is not required for wiring a page that does * not belong to an object. */ vm_wire_add(1); m->wire_count = 1; } /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; return (m); } static int vm_page_zone_import(void *arg, void **store, int cnt, int domain, int flags) { struct vm_domain *vmd; struct vm_pgcache *pgcache; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); /* * The page daemon should avoid creating extra memory pressure since its * main purpose is to replenish the store of free pages. */ if (vmd->vmd_severeset || curproc == pageproc || !_vm_domain_allocate(vmd, VM_ALLOC_NORMAL, cnt)) return (0); domain = vmd->vmd_domain; vm_domain_free_lock(vmd); i = vm_phys_alloc_npages(domain, pgcache->pool, cnt, (vm_page_t *)store); vm_domain_free_unlock(vmd); if (cnt != i) vm_domain_freecnt_inc(vmd, cnt - i); return (i); } static void vm_page_zone_release(void *arg, void **store, int cnt) { struct vm_domain *vmd; struct vm_pgcache *pgcache; vm_page_t m; int i; pgcache = arg; vmd = VM_DOMAIN(pgcache->domain); vm_domain_free_lock(vmd); for (i = 0; i < cnt; i++) { m = (vm_page_t)store[i]; vm_phys_free_pages(m, 0); } vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } #define VPSC_ANY 0 /* No restrictions. */ #define VPSC_NORESERV 1 /* Skip reservations; implies VPSC_NOSUPER. */ #define VPSC_NOSUPER 2 /* Skip superpages. */ /* * vm_page_scan_contig: * * Scan vm_page_array[] between the specified entries "m_start" and * "m_end" for a run of contiguous physical pages that satisfy the * specified conditions, and return the lowest page in the run. The * specified "alignment" determines the alignment of the lowest physical * page in the run. If the specified "boundary" is non-zero, then the * run of physical pages cannot span a physical address that is a * multiple of "boundary". * * "m_end" is never dereferenced, so it need not point to a vm_page * structure within vm_page_array[]. * * "npages" must be greater than zero. "m_start" and "m_end" must not * span a hole (or discontiguity) in the physical address space. Both * "alignment" and "boundary" must be a power of two. */ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options) { struct mtx *m_mtx; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_run; #if VM_NRESERVLEVEL > 0 int level; #endif int m_inc, order, run_ext, run_len; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); m_run = NULL; run_len = 0; m_mtx = NULL; for (m = m_start; m < m_end && run_len < npages; m += m_inc) { KASSERT((m->flags & PG_MARKER) == 0, ("page %p is PG_MARKER", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1, ("fictitious page %p has invalid wire count", m)); /* * If the current page would be the start of a run, check its * physical address against the end, alignment, and boundary * conditions. If it doesn't satisfy these conditions, either * terminate the scan or advance to the next page that * satisfies the failed condition. */ if (run_len == 0) { KASSERT(m_run == NULL, ("m_run != NULL")); if (m + npages > m_end) break; pa = VM_PAGE_TO_PHYS(m); if ((pa & (alignment - 1)) != 0) { m_inc = atop(roundup2(pa, alignment) - pa); continue; } if (rounddown2(pa ^ (pa + ptoa(npages) - 1), boundary) != 0) { m_inc = atop(roundup2(pa, boundary) - pa); continue; } } else KASSERT(m_run != NULL, ("m_run == NULL")); vm_page_change_lock(m, &m_mtx); m_inc = 1; retry: if (vm_page_held(m)) run_ext = 0; #if VM_NRESERVLEVEL > 0 else if ((level = vm_reserv_level(m)) >= 0 && (options & VPSC_NORESERV) != 0) { run_ext = 0; /* Advance to the end of the reservation. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); } #endif else if ((object = m->object) != NULL) { /* * The page is considered eligible for relocation if * and only if it could be laundered or reclaimed by * the page daemon. */ if (!VM_OBJECT_TRYRLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_RLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_RUNLOCK(object); goto retry; } else if (vm_page_held(m)) { run_ext = 0; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) { run_ext = 0; #if VM_NRESERVLEVEL > 0 } else if ((options & VPSC_NOSUPER) != 0 && (level = vm_reserv_level_iffullpop(m)) >= 0) { run_ext = 0; /* Advance to the end of the superpage. */ pa = VM_PAGE_TO_PHYS(m); m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) - pa); #endif } else if (object->memattr == VM_MEMATTR_DEFAULT && vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { /* * The page is allocated but eligible for * relocation. Extend the current run by one * page. */ KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ run_ext = 1; } else run_ext = 0; unlock: VM_OBJECT_RUNLOCK(object); #if VM_NRESERVLEVEL > 0 } else if (level >= 0) { /* * The page is reserved but not yet allocated. In * other words, it is still free. Extend the current * run by one page. */ run_ext = 1; #endif } else if ((order = m->order) < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it is the * first page in a power-of-two-sized run of * contiguous free pages. Add these pages to the end * of the current run, and jump ahead. */ run_ext = 1 << order; m_inc = 1 << order; } else { /* * Skip the page for one of the following reasons: (1) * It is enqueued in the physical memory allocator's * free page queues. However, it is not the first * page in a run of contiguous free pages. (This case * rarely occurs because the scan is performed in * ascending order.) (2) It is not reserved, and it is * transitioning from free to allocated. (Conversely, * the transition from allocated to free for managed * pages is blocked by the page lock.) (3) It is * allocated but not contained by an object and not * wired, e.g., allocated by Xen's balloon driver. */ run_ext = 0; } /* * Extend or reset the current run of pages. */ if (run_ext > 0) { if (run_len == 0) m_run = m; run_len += run_ext; } else { if (run_len > 0) { m_run = NULL; run_len = 0; } } } if (m_mtx != NULL) mtx_unlock(m_mtx); if (run_len >= npages) return (m_run); return (NULL); } /* * vm_page_reclaim_run: * * Try to relocate each of the allocated virtual pages within the * specified run of physical pages to a new physical address. Free the * physical pages underlying the relocated virtual pages. A virtual page * is relocatable if and only if it could be laundered or reclaimed by * the page daemon. Whenever possible, a virtual page is relocated to a * physical address above "high". * * Returns 0 if every physical page within the run was already free or * just freed by a successful relocation. Otherwise, returns a non-zero * value indicating why the last attempt to relocate a virtual page was * unsuccessful. * * "req_class" must be an allocation class. */ static int vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run, vm_paddr_t high) { struct vm_domain *vmd; struct mtx *m_mtx; struct spglist free; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_end, m_new; int error, order, req; KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class, ("req_class is not an allocation class")); SLIST_INIT(&free); error = 0; m = m_run; m_end = m_run + npages; m_mtx = NULL; for (; error == 0 && m < m_end; m++) { KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0, ("page %p is PG_FICTITIOUS or PG_MARKER", m)); /* * Avoid releasing and reacquiring the same page lock. */ vm_page_change_lock(m, &m_mtx); retry: if (vm_page_held(m)) error = EBUSY; else if ((object = m->object) != NULL) { /* * The page is relocated if and only if it could be * laundered or reclaimed by the page daemon. */ if (!VM_OBJECT_TRYWLOCK(object)) { mtx_unlock(m_mtx); VM_OBJECT_WLOCK(object); mtx_lock(m_mtx); if (m->object != object) { /* * The page may have been freed. */ VM_OBJECT_WUNLOCK(object); goto retry; } else if (vm_page_held(m)) { error = EBUSY; goto unlock; } } KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("page %p is PG_UNHOLDFREE", m)); /* Don't care: PG_NODUMP, PG_ZERO. */ if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE) error = EINVAL; else if (object->memattr != VM_MEMATTR_DEFAULT) error = EINVAL; else if (vm_page_queue(m) != PQ_NONE && !vm_page_busied(m)) { KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("page %p has an unexpected memattr", m)); KASSERT((m->oflags & (VPO_SWAPINPROG | VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0, ("page %p has unexpected oflags", m)); /* Don't care: VPO_NOSYNC. */ if (m->valid != 0) { /* * First, try to allocate a new page * that is above "high". Failing * that, try to allocate a new page * that is below "m_run". Allocate * the new page between the end of * "m_run" and "high" only as a last * resort. */ req = req_class | VM_ALLOC_NOOBJ; if ((m->flags & PG_NODUMP) != 0) req |= VM_ALLOC_NODUMP; if (trunc_page(high) != ~(vm_paddr_t)PAGE_MASK) { m_new = vm_page_alloc_contig( NULL, 0, req, 1, round_page(high), ~(vm_paddr_t)0, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } else m_new = NULL; if (m_new == NULL) { pa = VM_PAGE_TO_PHYS(m_run); m_new = vm_page_alloc_contig( NULL, 0, req, 1, 0, pa - 1, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { pa += ptoa(npages); m_new = vm_page_alloc_contig( NULL, 0, req, 1, pa, high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); } if (m_new == NULL) { error = ENOMEM; goto unlock; } KASSERT(!vm_page_wired(m_new), ("page %p is wired", m_new)); /* * Replace "m" with the new page. For * vm_page_replace(), "m" must be busy * and dequeued. Finally, change "m" * as if vm_page_free() was called. */ if (object->ref_count != 0) pmap_remove_all(m); m_new->aflags = m->aflags & ~PGA_QUEUE_STATE_MASK; KASSERT(m_new->oflags == VPO_UNMANAGED, ("page %p is managed", m_new)); m_new->oflags = m->oflags & VPO_NOSYNC; pmap_copy_page(m, m_new); m_new->valid = m->valid; m_new->dirty = m->dirty; m->flags &= ~PG_ZERO; vm_page_xbusy(m); vm_page_dequeue(m); vm_page_replace_checked(m_new, object, m->pindex, m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); /* * The new page must be deactivated * before the object is unlocked. */ vm_page_change_lock(m_new, &m_mtx); vm_page_deactivate(m_new); } else { m->flags &= ~PG_ZERO; vm_page_dequeue(m); if (vm_page_free_prep(m)) SLIST_INSERT_HEAD(&free, m, plinks.s.ss); KASSERT(m->dirty == 0, ("page %p is dirty", m)); } } else error = EBUSY; unlock: VM_OBJECT_WUNLOCK(object); } else { MPASS(vm_phys_domain(m) == domain); vmd = VM_DOMAIN(domain); vm_domain_free_lock(vmd); order = m->order; if (order < VM_NFREEORDER) { /* * The page is enqueued in the physical memory * allocator's free page queues. Moreover, it * is the first page in a power-of-two-sized * run of contiguous free pages. Jump ahead * to the last page within that run, and * continue from there. */ m += (1 << order) - 1; } #if VM_NRESERVLEVEL > 0 else if (vm_reserv_is_page_free(m)) order = 0; #endif vm_domain_free_unlock(vmd); if (order == VM_NFREEORDER) error = EINVAL; } } if (m_mtx != NULL) mtx_unlock(m_mtx); if ((m = SLIST_FIRST(&free)) != NULL) { int cnt; vmd = VM_DOMAIN(domain); cnt = 0; vm_domain_free_lock(vmd); do { MPASS(vm_phys_domain(m) == domain); SLIST_REMOVE_HEAD(&free, plinks.s.ss); vm_phys_free_pages(m, 0); cnt++; } while ((m = SLIST_FIRST(&free)) != NULL); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, cnt); } return (error); } #define NRUNS 16 CTASSERT(powerof2(NRUNS)); #define RUN_INDEX(count) ((count) & (NRUNS - 1)) #define MIN_RECLAIM 8 /* * vm_page_reclaim_contig: * * Reclaim allocated, contiguous physical memory satisfying the specified * conditions by relocating the virtual pages using that physical memory. * Returns true if reclamation is successful and false otherwise. Since * relocation requires the allocation of physical pages, reclamation may * fail due to a shortage of free pages. When reclamation fails, callers * are expected to perform vm_wait() before retrying a failed allocation * operation, e.g., vm_page_alloc_contig(). * * The caller must always specify an allocation class through "req". * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * The optional allocation flags are ignored. * * "npages" must be greater than zero. Both "alignment" and "boundary" * must be a power of two. */ bool vm_page_reclaim_contig_domain(int domain, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domain *vmd; vm_paddr_t curr_low; vm_page_t m_run, m_runs[NRUNS]; u_long count, reclaimed; int error, i, options, req_class; KASSERT(npages > 0, ("npages is 0")); KASSERT(powerof2(alignment), ("alignment is not a power of 2")); KASSERT(powerof2(boundary), ("boundary is not a power of 2")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Return if the number of free pages cannot satisfy the requested * allocation. */ vmd = VM_DOMAIN(domain); count = vmd->vmd_free_count; if (count < npages + vmd->vmd_free_reserved || (count < npages + vmd->vmd_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) || (count < npages && req_class == VM_ALLOC_INTERRUPT)) return (false); /* * Scan up to three times, relaxing the restrictions ("options") on * the reclamation of reservations and superpages each time. */ for (options = VPSC_NORESERV;;) { /* * Find the highest runs that satisfy the given constraints * and restrictions, and record them in "m_runs". */ curr_low = low; count = 0; for (;;) { m_run = vm_phys_scan_contig(domain, npages, curr_low, high, alignment, boundary, options); if (m_run == NULL) break; curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages); m_runs[RUN_INDEX(count)] = m_run; count++; } /* * Reclaim the highest runs in LIFO (descending) order until * the number of reclaimed pages, "reclaimed", is at least * MIN_RECLAIM. Reset "reclaimed" each time because each * reclamation is idempotent, and runs will (likely) recur * from one scan to the next as restrictions are relaxed. */ reclaimed = 0; for (i = 0; count > 0 && i < NRUNS; i++) { count--; m_run = m_runs[RUN_INDEX(count)]; error = vm_page_reclaim_run(req_class, domain, npages, m_run, high); if (error == 0) { reclaimed += npages; if (reclaimed >= MIN_RECLAIM) return (true); } } /* * Either relax the restrictions on the next scan or return if * the last scan had no restrictions. */ if (options == VPSC_NORESERV) options = VPSC_NOSUPER; else if (options == VPSC_NOSUPER) options = VPSC_ANY; else if (options == VPSC_ANY) return (reclaimed != 0); } } bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { struct vm_domainset_iter di; int domain; bool ret; vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req); do { ret = vm_page_reclaim_contig_domain(domain, req, npages, low, high, alignment, boundary); if (ret) break; } while (vm_domainset_iter_page(&di, NULL, &domain) == 0); return (ret); } /* * Set the domain in the appropriate page level domainset. */ void vm_domain_set(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (!vmd->vmd_minset && vm_paging_min(vmd)) { vmd->vmd_minset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_min_domains); } if (!vmd->vmd_severeset && vm_paging_severe(vmd)) { vmd->vmd_severeset = 1; DOMAINSET_SET(vmd->vmd_domain, &vm_severe_domains); } mtx_unlock(&vm_domainset_lock); } /* * Clear the domain from the appropriate page level domainset. */ void vm_domain_clear(struct vm_domain *vmd) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_minset && !vm_paging_min(vmd)) { vmd->vmd_minset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_min_domains); if (vm_min_waiters != 0) { vm_min_waiters = 0; wakeup(&vm_min_domains); } } if (vmd->vmd_severeset && !vm_paging_severe(vmd)) { vmd->vmd_severeset = 0; DOMAINSET_CLR(vmd->vmd_domain, &vm_severe_domains); if (vm_severe_waiters != 0) { vm_severe_waiters = 0; wakeup(&vm_severe_domains); } } /* * If pageout daemon needs pages, then tell it that there are * some free. */ if (vmd->vmd_pageout_pages_needed && vmd->vmd_free_count >= vmd->vmd_pageout_free_min) { wakeup(&vmd->vmd_pageout_pages_needed); vmd->vmd_pageout_pages_needed = 0; } /* See comments in vm_wait_doms(). */ if (vm_pageproc_waiters) { vm_pageproc_waiters = 0; wakeup(&vm_pageproc_waiters); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the min threshold globally. */ void vm_wait_min(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_min()) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } /* * Wait for free pages to exceed the severe threshold globally. */ void vm_wait_severe(void) { mtx_lock(&vm_domainset_lock); while (vm_page_count_severe()) { vm_severe_waiters++; msleep(&vm_severe_domains, &vm_domainset_lock, PVM, "vmwait", 0); } mtx_unlock(&vm_domainset_lock); } u_int vm_wait_count(void) { return (vm_severe_waiters + vm_min_waiters + vm_pageproc_waiters); } void vm_wait_doms(const domainset_t *wdoms) { /* * We use racey wakeup synchronization to avoid expensive global * locking for the pageproc when sleeping with a non-specific vm_wait. * To handle this, we only sleep for one tick in this instance. It * is expected that most allocations for the pageproc will come from * kmem or vm_page_grab* which will use the more specific and * race-free vm_wait_domain(). */ if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); vm_pageproc_waiters++; msleep(&vm_pageproc_waiters, &vm_domainset_lock, PVM | PDROP, "pageprocwait", 1); } else { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(wdoms)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PVM | PDROP, "vmwait", 0); } else mtx_unlock(&vm_domainset_lock); } } /* * vm_wait_domain: * * Sleep until free pages are available for allocation. * - Called in various places after failed memory allocations. */ void vm_wait_domain(int domain) { struct vm_domain *vmd; domainset_t wdom; vmd = VM_DOMAIN(domain); vm_domain_free_assert_unlocked(vmd); if (curproc == pageproc) { mtx_lock(&vm_domainset_lock); if (vmd->vmd_free_count < vmd->vmd_pageout_free_min) { vmd->vmd_pageout_pages_needed = 1; msleep(&vmd->vmd_pageout_pages_needed, &vm_domainset_lock, PDROP | PSWP, "VMWait", 0); } else mtx_unlock(&vm_domainset_lock); } else { if (pageproc == NULL) panic("vm_wait in early boot"); DOMAINSET_ZERO(&wdom); DOMAINSET_SET(vmd->vmd_domain, &wdom); vm_wait_doms(&wdom); } } /* * vm_wait: * * Sleep until free pages are available for allocation in the * affinity domains of the obj. If obj is NULL, the domain set * for the calling thread is used. * Called in various places after failed memory allocations. */ void vm_wait(vm_object_t obj) { struct domainset *d; d = NULL; /* * Carefully fetch pointers only once: the struct domainset * itself is ummutable but the pointer might change. */ if (obj != NULL) d = obj->domain.dr_policy; if (d == NULL) d = curthread->td_domain.dr_policy; vm_wait_doms(&d->ds_mask); } /* * vm_domain_alloc_fail: * * Called when a page allocation function fails. Informs the * pagedaemon and performs the requested wait. Requires the * domain_free and object lock on entry. Returns with the * object lock held and free lock released. Returns an error when * retry is necessary. * */ static int vm_domain_alloc_fail(struct vm_domain *vmd, vm_object_t object, int req) { vm_domain_free_assert_unlocked(vmd); atomic_add_int(&vmd->vmd_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) { if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_wait_domain(vmd->vmd_domain); if (object != NULL) VM_OBJECT_WLOCK(object); if (req & VM_ALLOC_WAITOK) return (EAGAIN); } return (0); } /* * vm_waitpfault: * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(struct domainset *dset, int timo) { /* * XXX Ideally we would wait only until the allocation could * be satisfied. This condition can cause new allocators to * consume all freed pages while old allocators wait. */ mtx_lock(&vm_domainset_lock); if (vm_page_count_min_set(&dset->ds_mask)) { vm_min_waiters++; msleep(&vm_min_domains, &vm_domainset_lock, PUSER | PDROP, "pfault", timo); } else mtx_unlock(&vm_domainset_lock); } static struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { uint8_t queue; if ((queue = atomic_load_8(&m->queue)) == PQ_NONE) return (NULL); return (&vm_pagequeue_domain(m)->vmd_pagequeues[queue]); } static inline void vm_pqbatch_process_page(struct vm_pagequeue *pq, vm_page_t m) { struct vm_domain *vmd; uint8_t qflags; CRITICAL_ASSERT(curthread); vm_pagequeue_assert_locked(pq); /* * The page daemon is allowed to set m->queue = PQ_NONE without * the page queue lock held. In this case it is about to free the page, * which must not have any queue state. */ qflags = atomic_load_8(&m->aflags); KASSERT(pq == vm_page_pagequeue(m) || (qflags & PGA_QUEUE_STATE_MASK) == 0, ("page %p doesn't belong to queue %p but has aflags %#x", m, pq, qflags)); if ((qflags & PGA_DEQUEUE) != 0) { if (__predict_true((qflags & PGA_ENQUEUED) != 0)) vm_pagequeue_remove(pq, m); vm_page_dequeue_complete(m); } else if ((qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)) != 0) { if ((qflags & PGA_ENQUEUED) != 0) TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); else { vm_pagequeue_cnt_inc(pq); vm_page_aflag_set(m, PGA_ENQUEUED); } /* * Give PGA_REQUEUE_HEAD precedence over PGA_REQUEUE. * In particular, if both flags are set in close succession, * only PGA_REQUEUE_HEAD will be applied, even if it was set * first. */ if ((qflags & PGA_REQUEUE_HEAD) != 0) { KASSERT(m->queue == PQ_INACTIVE, ("head enqueue not supported for page %p", m)); vmd = vm_pagequeue_domain(m); TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q); } else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_page_aflag_clear(m, qflags & (PGA_REQUEUE | PGA_REQUEUE_HEAD)); } } static void vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_batchqueue *bq, uint8_t queue) { vm_page_t m; int i; for (i = 0; i < bq->bq_cnt; i++) { m = bq->bq_pa[i]; if (__predict_false(m->queue != queue)) continue; vm_pqbatch_process_page(pq, m); } vm_batchqueue_init(bq); } static void vm_pqbatch_submit_page(vm_page_t m, uint8_t queue) { struct vm_batchqueue *bq; struct vm_pagequeue *pq; int domain; vm_page_assert_locked(m); KASSERT(queue < PQ_COUNT, ("invalid queue %d", queue)); domain = vm_phys_domain(m); pq = &vm_pagequeue_domain(m)->vmd_pagequeues[queue]; critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); if (vm_batchqueue_insert(bq, m)) { critical_exit(); return; } if (!vm_pagequeue_trylock(pq)) { critical_exit(); vm_pagequeue_lock(pq); critical_enter(); bq = DPCPU_PTR(pqbatch[domain][queue]); } vm_pqbatch_process(pq, bq, queue); /* * The page may have been logically dequeued before we acquired the * page queue lock. In this case, the page lock prevents the page * from being logically enqueued elsewhere. */ if (__predict_true(m->queue == queue)) vm_pqbatch_process_page(pq, m); else { KASSERT(m->queue == PQ_NONE, ("invalid queue transition for page %p", m)); KASSERT((m->aflags & PGA_ENQUEUED) == 0, ("page %p is enqueued with invalid queue index", m)); vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK); } vm_pagequeue_unlock(pq); critical_exit(); } /* * vm_page_drain_pqbatch: [ internal use only ] * * Force all per-CPU page queue batch queues to be drained. This is * intended for use in severe memory shortages, to ensure that pages * do not remain stuck in the batch queues. */ void vm_page_drain_pqbatch(void) { struct thread *td; struct vm_domain *vmd; struct vm_pagequeue *pq; int cpu, domain, queue; td = curthread; CPU_FOREACH(cpu) { thread_lock(td); sched_bind(td, cpu); thread_unlock(td); for (domain = 0; domain < vm_ndomains; domain++) { vmd = VM_DOMAIN(domain); for (queue = 0; queue < PQ_COUNT; queue++) { pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); critical_enter(); vm_pqbatch_process(pq, DPCPU_PTR(pqbatch[domain][queue]), queue); critical_exit(); vm_pagequeue_unlock(pq); } } } thread_lock(td); sched_unbind(td); thread_unlock(td); } /* * Complete the logical removal of a page from a page queue. We must be * careful to synchronize with the page daemon, which may be concurrently * examining the page with only the page lock held. The page must not be * in a state where it appears to be logically enqueued. */ static void vm_page_dequeue_complete(vm_page_t m) { m->queue = PQ_NONE; atomic_thread_fence_rel(); vm_page_aflag_clear(m, PGA_QUEUE_STATE_MASK); } /* * vm_page_dequeue_deferred: [ internal use only ] * * Request removal of the given page from its current page * queue. Physical removal from the queue may be deferred * indefinitely. * * The page must be locked. */ void vm_page_dequeue_deferred(vm_page_t m) { uint8_t queue; vm_page_assert_locked(m); if ((queue = vm_page_queue(m)) == PQ_NONE) return; vm_page_aflag_set(m, PGA_DEQUEUE); vm_pqbatch_submit_page(m, queue); } /* * vm_page_dequeue: * * Remove the page from whichever page queue it's in, if any. * The page must either be locked or unallocated. This constraint * ensures that the queue state of the page will remain consistent * after this function returns. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq, *pq1; uint8_t aflags; KASSERT(mtx_owned(vm_page_lockptr(m)) || m->object == NULL, ("page %p is allocated and unlocked", m)); for (pq = vm_page_pagequeue(m);; pq = pq1) { if (pq == NULL) { /* * A thread may be concurrently executing * vm_page_dequeue_complete(). Ensure that all queue * state is cleared before we return. */ aflags = atomic_load_8(&m->aflags); if ((aflags & PGA_QUEUE_STATE_MASK) == 0) return; KASSERT((aflags & PGA_DEQUEUE) != 0, ("page %p has unexpected queue state flags %#x", m, aflags)); /* * Busy wait until the thread updating queue state is * finished. Such a thread must be executing in a * critical section. */ cpu_spinwait(); pq1 = vm_page_pagequeue(m); continue; } vm_pagequeue_lock(pq); if ((pq1 = vm_page_pagequeue(m)) == pq) break; vm_pagequeue_unlock(pq); } KASSERT(pq == vm_page_pagequeue(m), ("%s: page %p migrated directly between queues", __func__, m)); KASSERT((m->aflags & PGA_DEQUEUE) != 0 || mtx_owned(vm_page_lockptr(m)), ("%s: queued unlocked page %p", __func__, m)); if ((m->aflags & PGA_ENQUEUED) != 0) vm_pagequeue_remove(pq, m); vm_page_dequeue_complete(m); vm_pagequeue_unlock(pq); } /* * Schedule the given page for insertion into the specified page queue. * Physical insertion of the page may be deferred indefinitely. */ static void vm_page_enqueue(vm_page_t m, uint8_t queue) { vm_page_assert_locked(m); KASSERT(m->queue == PQ_NONE && (m->aflags & PGA_QUEUE_STATE_MASK) == 0, ("%s: page %p is already enqueued", __func__, m)); m->queue = queue; if ((m->aflags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_pqbatch_submit_page(m, queue); } /* * vm_page_requeue: [ internal use only ] * * Schedule a requeue of the given page. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { vm_page_assert_locked(m); KASSERT(vm_page_queue(m) != PQ_NONE, ("%s: page %p is not logically enqueued", __func__, m)); if ((m->aflags & PGA_REQUEUE) == 0) vm_page_aflag_set(m, PGA_REQUEUE); vm_pqbatch_submit_page(m, atomic_load_8(&m->queue)); } /* * vm_page_free_prep: * * Prepares the given page to be put on the free list, * disassociating it from any VM object. The caller may return * the page to the free list only if this function returns true. * * The object must be locked. The page must be locked if it is * managed. */ bool vm_page_free_prep(vm_page_t m) { #if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP) if (PMAP_HAS_DMAP && (m->flags & PG_ZERO) != 0) { uint64_t *p; int i; p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++) KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx", m, i, (uintmax_t)*p)); } #endif if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_prep: freeing mapped page %p", m)); KASSERT((m->aflags & (PGA_EXECUTABLE | PGA_WRITEABLE)) == 0, ("vm_page_free_prep: mapping flags set in page %p", m)); } else { KASSERT(m->queue == PQ_NONE, ("vm_page_free_prep: unmanaged page %p is queued", m)); } VM_CNT_INC(v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free_prep: freeing busy page %p", m); if (m->object != NULL) (void)vm_page_remove(m); /* * If fictitious remove object association and * return. */ if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("fictitious page %p is not wired", m)); KASSERT(m->queue == PQ_NONE, ("fictitious page %p is queued", m)); return (false); } /* * Pages need not be dequeued before they are returned to the physical * memory allocator, but they must at least be marked for a deferred * dequeue. */ if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_dequeue_deferred(m); m->valid = 0; vm_page_undirty(m); if (vm_page_wired(m) != 0) panic("vm_page_free_prep: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free_prep: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; return (false); } /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); #if VM_NRESERVLEVEL > 0 /* * Determine whether the page belongs to a reservation. If the page was * allocated from a per-CPU cache, it cannot belong to a reservation, so * as an optimization, we avoid the check in that case. */ if ((m->flags & PG_PCPU_CACHE) == 0 && vm_reserv_free_page(m)) return (false); #endif return (true); } /* * vm_page_free_toq: * * Returns the given page to the free list, disassociating it * from any VM object. * * The object must be locked. The page must be locked if it is * managed. */ void vm_page_free_toq(vm_page_t m) { struct vm_domain *vmd; uma_zone_t zone; if (!vm_page_free_prep(m)) return; vmd = vm_pagequeue_domain(m); zone = vmd->vmd_pgcache[m->pool].zone; if ((m->flags & PG_PCPU_CACHE) != 0 && zone != NULL) { uma_zfree(zone, m); return; } vm_domain_free_lock(vmd); vm_phys_free_pages(m, 0); vm_domain_free_unlock(vmd); vm_domain_freecnt_inc(vmd, 1); } /* * vm_page_free_pages_toq: * * Returns a list of pages to the free list, disassociating it * from any VM object. In other words, this is equivalent to * calling vm_page_free_toq() for each page of a list of VM objects. * * The objects must be locked. The pages must be locked if it is * managed. */ void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count) { vm_page_t m; int count; if (SLIST_EMPTY(free)) return; count = 0; while ((m = SLIST_FIRST(free)) != NULL) { count++; SLIST_REMOVE_HEAD(free, plinks.s.ss); vm_page_free_toq(m); } if (update_wire_count) vm_wire_sub(count); } /* * vm_page_wire: * * Mark this page as wired down. If the page is fictitious, then * its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (!vm_page_wired(m)) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_wire_add(1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially allowing it to be * paged out. Returns TRUE if the number of wirings transitions to zero and * FALSE otherwise. * * Only managed pages belonging to an object can be paged out. If the number * of wirings transitions to zero and the page is eligible for page out, then * the page is added to the specified paging queue (unless PQ_NONE is * specified, in which case the page is dequeued if it belongs to a paging * queue). * * If a page is fictitious, then its wire count must always be one. * * A managed page must be locked. */ bool vm_page_unwire(vm_page_t m, uint8_t queue) { bool unwired; KASSERT(queue < PQ_COUNT || queue == PQ_NONE, ("vm_page_unwire: invalid queue %u request for page %p", queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); unwired = vm_page_unwire_noq(m); if (!unwired || (m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL) return (unwired); if (vm_page_queue(m) == queue) { if (queue == PQ_ACTIVE) vm_page_reference(m); else if (queue != PQ_NONE) vm_page_requeue(m); } else { vm_page_dequeue(m); if (queue != PQ_NONE) { vm_page_enqueue(m, queue); if (queue == PQ_ACTIVE) /* Initialize act_count. */ vm_page_activate(m); } } return (unwired); } /* * * vm_page_unwire_noq: * * Unwire a page without (re-)inserting it into a page queue. It is up * to the caller to enqueue, requeue, or free the page as appropriate. * In most cases, vm_page_unwire() should be used instead. */ bool vm_page_unwire_noq(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_assert_locked(m); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return (false); } if (!vm_page_wired(m)) panic("vm_page_unwire: page %p's wire count is zero", m); m->wire_count--; if (m->wire_count == 0) { vm_wire_sub(1); return (true); } else return (false); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { vm_page_assert_locked(m); if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0) return; if (vm_page_queue(m) == PQ_ACTIVE) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; return; } vm_page_dequeue(m); if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; vm_page_enqueue(m, PQ_ACTIVE); } /* * Move the specified page to the tail of the inactive queue, or requeue * the page if it is already in the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { vm_page_assert_locked(m); if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0) return; if (!vm_page_inactive(m)) { vm_page_dequeue(m); vm_page_enqueue(m, PQ_INACTIVE); } else vm_page_requeue(m); } /* * Move the specified page close to the head of the inactive queue, * bypassing LRU. A marker page is used to maintain FIFO ordering. * As with regular enqueues, we use a per-CPU batch queue to reduce * contention on the page queue lock. * * The page must be locked. */ void vm_page_deactivate_noreuse(vm_page_t m) { vm_page_assert_locked(m); if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0) return; if (!vm_page_inactive(m)) { vm_page_dequeue(m); m->queue = PQ_INACTIVE; } if ((m->aflags & PGA_REQUEUE_HEAD) == 0) vm_page_aflag_set(m, PGA_REQUEUE_HEAD); vm_pqbatch_submit_page(m, PQ_INACTIVE); } /* * vm_page_launder * * Put a page in the laundry, or requeue it if it is already there. */ void vm_page_launder(vm_page_t m) { vm_page_assert_locked(m); if (vm_page_wired(m) || (m->oflags & VPO_UNMANAGED) != 0) return; if (vm_page_in_laundry(m)) vm_page_requeue(m); else { vm_page_dequeue(m); vm_page_enqueue(m, PQ_LAUNDRY); } } /* * vm_page_unswappable * * Put a page in the PQ_UNSWAPPABLE holding queue. */ void vm_page_unswappable(vm_page_t m) { vm_page_assert_locked(m); KASSERT(!vm_page_wired(m) && (m->oflags & VPO_UNMANAGED) == 0, ("page %p already unswappable", m)); vm_page_dequeue(m); vm_page_enqueue(m, PQ_UNSWAPPABLE); } static void vm_page_release_toq(vm_page_t m, int flags) { /* * Use a check of the valid bits to determine whether we should * accelerate reclamation of the page. The object lock might not be * held here, in which case the check is racy. At worst we will either * accelerate reclamation of a valid page and violate LRU, or * unnecessarily defer reclamation of an invalid page. * * If we were asked to not cache the page, place it near the head of the * inactive queue so that is reclaimed sooner. */ if ((flags & (VPR_TRYFREE | VPR_NOREUSE)) != 0 || m->valid == 0) vm_page_deactivate_noreuse(m); else if (vm_page_active(m)) vm_page_reference(m); else vm_page_deactivate(m); } /* * Unwire a page and either attempt to free it or re-add it to the page queues. */ void vm_page_release(vm_page_t m, int flags) { vm_object_t object; bool freed; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release: page %p is unmanaged", m)); vm_page_lock(m); if (m->object != NULL) VM_OBJECT_ASSERT_UNLOCKED(m->object); if (vm_page_unwire_noq(m)) { if ((object = m->object) == NULL) { vm_page_free(m); } else { freed = false; if ((flags & VPR_TRYFREE) != 0 && !vm_page_busied(m) && /* Depends on type stability. */ VM_OBJECT_TRYWLOCK(object)) { /* * Only free unmapped pages. The busy test from * before the object was locked cannot be relied * upon. */ if ((object->ref_count == 0 || !pmap_page_is_mapped(m)) && m->dirty == 0 && !vm_page_busied(m)) { vm_page_free(m); freed = true; } VM_OBJECT_WUNLOCK(object); } if (!freed) vm_page_release_toq(m, flags); } } vm_page_unlock(m); } /* See vm_page_release(). */ void vm_page_release_locked(vm_page_t m, int flags) { VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_page_release_locked: page %p is unmanaged", m)); vm_page_lock(m); if (vm_page_unwire_noq(m)) { if ((flags & VPR_TRYFREE) != 0 && (m->object->ref_count == 0 || !pmap_page_is_mapped(m)) && m->dirty == 0 && !vm_page_busied(m)) { vm_page_free(m); } else { vm_page_release_toq(m, flags); } } vm_page_unlock(m); } /* * vm_page_advise * * Apply the specified advice to the given page. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) /* * Mark the page clean. This will allow the page to be freed * without first paging it out. MADV_FREE pages are often * quickly reused by malloc(3), so we do not do anything that * would result in a page fault on a later access. */ vm_page_undirty(m); else if (advice != MADV_DONTNEED) { if (advice == MADV_WILLNEED) vm_page_activate(m); return; } /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); /* * Place clean pages near the head of the inactive queue rather than * the tail, thus defeating the queue's LRU operation and ensuring that * the page will be reused quickly. Dirty pages not already in the * laundry are moved there. */ if (m->dirty == 0) vm_page_deactivate_noreuse(m); else if (!vm_page_in_laundry(m)) vm_page_launder(m); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; int pflags; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, pflags); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Return the specified range of pages from the given object. For each * page offset within the range, if a page already exists within the object * at that offset and it is busy, then wait for it to change state. If, * instead, the page doesn't exist, then allocate it. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs the pages * * The caller must always specify that the pages are to be busied and/or * wired. * * optional allocation flags: * VM_ALLOC_IGN_SBUSY do not sleep on soft busy pages * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NOWAIT do not sleep * VM_ALLOC_SBUSY set page to sbusy state * VM_ALLOC_WIRED wire the pages * VM_ALLOC_ZERO zero and validate any invalid pages * * If VM_ALLOC_NOWAIT is not specified, this routine may sleep. Otherwise, it * may return a partial prefix of the requested range. */ int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags, vm_page_t *ma, int count) { vm_page_t m, mpred; int pflags; int i; bool sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0, ("vm_page_grap_pages: VM_ALLOC_COUNT() is not allowed")); KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 || (allocflags & VM_ALLOC_WIRED) != 0, ("vm_page_grab_pages: the pages must be busied or wired")); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch")); if (count == 0) return (0); pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY); if ((allocflags & VM_ALLOC_NOWAIT) == 0) pflags |= VM_ALLOC_WAITFAIL; i = 0; retrylookup: m = vm_radix_lookup_le(&object->rtree, pindex + i); if (m == NULL || m->pindex != pindex + i) { mpred = m; m = NULL; } else mpred = TAILQ_PREV(m, pglist, listq); for (; i < count; i++) { if (m != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) break; /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "grbmaw", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); goto retrylookup; } if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); } else { m = vm_page_alloc_after(object, pindex + i, pflags | VM_ALLOC_COUNT(count - i), mpred); if (m == NULL) { if ((allocflags & VM_ALLOC_NOWAIT) != 0) break; goto retrylookup; } } if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) { if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; } ma[i] = mpred = m; m = vm_page_next(m); } return (i); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = rounddown2(base, DEV_BSIZE)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zeroed. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zeroed by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * Returns true if all of the specified predicates are true for the entire * (super)page and false otherwise. */ bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m) { vm_object_t object; int i, npages; object = m->object; if (skip_m != NULL && skip_m->object != object) return (false); VM_OBJECT_ASSERT_LOCKED(object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { /* Always test object consistency, including "skip_m". */ if (m[i].object != object) return (false); if (&m[i] == skip_m) continue; if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i])) return (false); if ((flags & PS_ALL_DIRTY) != 0) { /* * Calling vm_page_test_dirty() or pmap_is_modified() * might stop this case from spuriously returning * "false". However, that would require a write lock * on the object containing "m[i]". */ if (m[i].dirty != VM_PAGE_BITS_ALL) return (false); } if ((flags & PS_ALL_VALID) != 0 && m[i].valid != VM_PAGE_BITS_ALL) return (false); } return (true); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits) { if ((bits & PGA_WRITEABLE) == 0) return; /* * The PGA_WRITEABLE flag can only be set if the page is * managed, is exclusively busied or the object is locked. * Currently, this flag is only set by pmap_enter(). */ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("PGA_WRITEABLE on unmanaged page")); if (!vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_free_count()); db_printf("vm_cnt.v_inactive_count: %d\n", vm_inactive_count()); db_printf("vm_cnt.v_active_count: %d\n", vm_active_count()); db_printf("vm_cnt.v_laundry_count: %d\n", vm_laundry_count()); db_printf("vm_cnt.v_wire_count: %d\n", vm_wire_count()); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d\n", vm_free_count()); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d pq_unsw %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys, virt; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; virt = strchr(modif, 'v') != NULL; if (virt) m = PHYS_TO_VM_PAGE(pmap_kextract(addr)); else if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags, m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */ Index: stable/12 =================================================================== --- stable/12 (revision 365006) +++ stable/12 (revision 365007) Property changes on: stable/12 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,1 +0,0 ## Reverse-merged /head:r364302,364306