Index: head/sys/vm/device_pager.c =================================================================== --- head/sys/vm/device_pager.c (revision 127960) +++ head/sys/vm/device_pager.c (revision 127961) @@ -1,299 +1,295 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include static void dev_pager_init(void); static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t); static void dev_pager_dealloc(vm_object_t); static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int); static void dev_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); /* list of device pager objects */ static struct pagerlst dev_pager_object_list; /* protect against object creation */ static struct sx dev_pager_sx; /* protect list manipulation */ static struct mtx dev_pager_mtx; static uma_zone_t fakepg_zone; static vm_page_t dev_pager_getfake(vm_paddr_t); static void dev_pager_putfake(vm_page_t); struct pagerops devicepagerops = { .pgo_init = dev_pager_init, .pgo_alloc = dev_pager_alloc, .pgo_dealloc = dev_pager_dealloc, .pgo_getpages = dev_pager_getpages, .pgo_putpages = dev_pager_putpages, .pgo_haspage = dev_pager_haspage, }; static void dev_pager_init() { TAILQ_INIT(&dev_pager_object_list); sx_init(&dev_pager_sx, "dev_pager create"); mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF); fakepg_zone = uma_zcreate("DP fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE|UMA_ZONE_VM); } /* * MPSAFE */ static vm_object_t dev_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff) { dev_t dev; d_mmap_t *mapfunc; vm_object_t object; vm_pindex_t pindex; unsigned int npages; vm_paddr_t paddr; vm_offset_t off; /* * Offset should be page aligned. */ if (foff & PAGE_MASK) return (NULL); size = round_page(size); pindex = OFF_TO_IDX(foff + size); /* * Make sure this device can be mapped. */ dev = handle; mtx_lock(&Giant); mapfunc = devsw(dev)->d_mmap; if (mapfunc == NULL || mapfunc == (d_mmap_t *)nullop) { printf("obsolete map function %p\n", (void *)mapfunc); mtx_unlock(&Giant); return (NULL); } /* * Check that the specified range of the device allows the desired * protection. * * XXX assumes VM_PROT_* == PROT_* */ npages = OFF_TO_IDX(size); for (off = foff; npages--; off += PAGE_SIZE) if ((*mapfunc)(dev, off, &paddr, (int)prot) != 0) { mtx_unlock(&Giant); return (NULL); } /* * Lock to prevent object creation race condition. */ sx_xlock(&dev_pager_sx); /* * Look up pager, creating as necessary. */ object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object == NULL) { /* * Allocate object and associate it with the pager. */ object = vm_object_allocate(OBJT_DEVICE, pindex); object->handle = handle; TAILQ_INIT(&object->un_pager.devp.devp_pglist); mtx_lock(&dev_pager_mtx); TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list); mtx_unlock(&dev_pager_mtx); } else { /* * Gain a reference to the object. */ vm_object_reference(object); if (pindex > object->size) object->size = pindex; } sx_xunlock(&dev_pager_sx); mtx_unlock(&Giant); return (object); } static void dev_pager_dealloc(object) vm_object_t object; { vm_page_t m; mtx_lock(&dev_pager_mtx); TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list); mtx_unlock(&dev_pager_mtx); /* * Free up our fake pages. 
*/ while ((m = TAILQ_FIRST(&object->un_pager.devp.devp_pglist)) != 0) { TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, pageq); dev_pager_putfake(m); } } static int dev_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { vm_pindex_t offset; vm_paddr_t paddr; vm_page_t page; dev_t dev; int i, ret; d_mmap_t *mapfunc; int prot; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); dev = object->handle; offset = m[reqpage]->pindex; VM_OBJECT_UNLOCK(object); prot = PROT_READ; /* XXX should pass in? */ mapfunc = devsw(dev)->d_mmap; if (mapfunc == NULL || mapfunc == (d_mmap_t *)nullop) panic("dev_pager_getpage: no map function"); ret = (*mapfunc)(dev, (vm_offset_t)offset << PAGE_SHIFT, &paddr, prot); KASSERT(ret == 0, ("dev_pager_getpage: map function returns error")); /* * Replace the passed in reqpage page with our own fake page and * free up the all of the original pages. */ page = dev_pager_getfake(paddr); VM_OBJECT_LOCK(object); TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist, page, pageq); vm_page_lock_queues(); for (i = 0; i < count; i++) vm_page_free(m[i]); vm_page_unlock_queues(); vm_page_insert(page, object, offset); m[reqpage] = page; return (VM_PAGER_OK); } static void dev_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { panic("dev_pager_putpage called"); } static boolean_t dev_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { if (before != NULL) *before = 0; if (after != NULL) *after = 0; return (TRUE); } static vm_page_t dev_pager_getfake(paddr) vm_paddr_t paddr; { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK); m->flags = PG_BUSY | PG_FICTITIOUS; m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; m->busy = 0; m->queue = PQ_NONE; m->object = NULL; m->wire_count = 1; m->hold_count = 0; m->phys_addr = paddr; return (m); } static void dev_pager_putfake(m) vm_page_t m; { if (!(m->flags & PG_FICTITIOUS)) panic("dev_pager_putfake: bad page"); uma_zfree(fakepg_zone, m); } Index: head/sys/vm/pmap.h =================================================================== --- head/sys/vm/pmap.h (revision 127960) +++ head/sys/vm/pmap.h (revision 127961) @@ -1,143 +1,139 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
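/*
 * [Editor's sketch, not part of revision 127961] dev_pager_alloc() and
 * dev_pager_getpages() above validate and service device mappings by
 * calling the driver's d_mmap entry point once per page and expecting it
 * to return the physical address backing that page. A minimal d_mmap
 * handler for a hypothetical memory-mapped device could look like the
 * following; "mydev_softc", "mydev_mmap" and the softc fields are
 * illustrative names only, not part of the tree.
 */
#include <sys/param.h>
#include <sys/conf.h>
#include <vm/vm.h>

struct mydev_softc {
	vm_paddr_t	mem_base;	/* physical start of the device window */
	vm_size_t	mem_size;	/* size of the window in bytes */
};

static int
mydev_mmap(dev_t dev, vm_offset_t offset, vm_paddr_t *paddr, int nprot)
{
	struct mydev_softc *sc = dev->si_drv1;

	/* Reject offsets outside the device's physical window. */
	if (offset >= sc->mem_size)
		return (-1);	/* any non-zero return is failure to the pager */
	*paddr = sc->mem_base + offset;
	return (0);
}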
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.h 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Machine address mapping definitions -- machine-independent * section. [For machine-dependent section, see "machine/pmap.h".] */ #ifndef _PMAP_VM_ #define _PMAP_VM_ /* * Each machine dependent implementation is expected to * keep certain statistics. They may do this anyway they * so choose, but are expected to return the statistics * in the following structure. */ struct pmap_statistics { long resident_count; /* # of pages mapped (total) */ long wired_count; /* # of pages wired */ }; typedef struct pmap_statistics *pmap_statistics_t; #include #ifdef _KERNEL struct proc; struct thread; /* * Updates to kernel_vm_end are synchronized by the kernel_map's system mutex. 
*/ extern vm_offset_t kernel_vm_end; extern int pmap_pagedaemon_waken; void pmap_change_wiring(pmap_t, vm_offset_t, boolean_t); void pmap_clear_modify(vm_page_t m); void pmap_clear_reference(vm_page_t m); void pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); void pmap_copy_page(vm_page_t, vm_page_t); void pmap_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t); vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte); vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va); vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot); void pmap_growkernel(vm_offset_t); void pmap_init(void); boolean_t pmap_is_modified(vm_page_t m); boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t va); boolean_t pmap_ts_referenced(vm_page_t m); vm_offset_t pmap_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int); void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size); boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m); void pmap_page_protect(vm_page_t m, vm_prot_t prot); void pmap_pinit(pmap_t); void pmap_pinit0(pmap_t); void pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t); void pmap_qenter(vm_offset_t, vm_page_t *, int); void pmap_qremove(vm_offset_t, int); void pmap_release(pmap_t); void pmap_remove(pmap_t, vm_offset_t, vm_offset_t); void pmap_remove_all(vm_page_t m); void pmap_remove_pages(pmap_t, vm_offset_t, vm_offset_t); void pmap_zero_page(vm_page_t); void pmap_zero_page_area(vm_page_t, int off, int size); void pmap_zero_page_idle(vm_page_t); int pmap_mincore(pmap_t pmap, vm_offset_t addr); void pmap_activate(struct thread *td); vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size); void *pmap_kenter_temporary(vm_offset_t pa, int i); void pmap_init2(void); #define pmap_resident_count(pm) ((pm)->pm_stats.resident_count) #define pmap_wired_count(pm) ((pm)->pm_stats.wired_count) #endif /* _KERNEL */ #endif /* _PMAP_VM_ */ Index: head/sys/vm/swap_pager.h =================================================================== --- head/sys/vm/swap_pager.h (revision 127960) +++ head/sys/vm/swap_pager.h (revision 127961) @@ -1,59 +1,55 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
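/*
 * [Editor's sketch, not part of revision 127961] The declarations above
 * form the machine-independent pmap interface; MI code never touches MMU
 * structures directly. As a usage illustration, a helper that asks
 * whether a kernel virtual address currently has a physical page behind
 * it can be built on pmap_extract(), which returns 0 when no valid
 * mapping exists. "kva_is_mapped" is an invented name.
 */
#include <vm/vm.h>
#include <vm/pmap.h>

static __inline int
kva_is_mapped(vm_offset_t va)
{
	return (pmap_extract(kernel_pmap, va) != 0);
}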
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)swap_pager.h 7.1 (Berkeley) 12/5/90 * $FreeBSD$ */ #ifndef _VM_SWAP_PAGER_H_ #define _VM_SWAP_PAGER_H_ 1 #ifdef _KERNEL extern int swap_pager_full; extern int swap_pager_avail; struct swdevt; void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int); void swap_pager_freespace(vm_object_t, vm_pindex_t, vm_size_t); void swap_pager_swap_init(void); int swap_pager_isswapped(vm_object_t, struct swdevt *); int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t); void swap_pager_status(int *total, int *used); #endif /* _KERNEL */ #endif /* _VM_SWAP_PAGER_H_ */ Index: head/sys/vm/vm.h =================================================================== --- head/sys/vm/vm.h (revision 127960) +++ head/sys/vm/vm.h (revision 127961) @@ -1,134 +1,130 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm.h 8.2 (Berkeley) 12/13/93 * @(#)vm_prot.h 8.1 (Berkeley) 6/11/93 * @(#)vm_inherit.h 8.1 (Berkeley) 6/11/93 * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef VM_H #define VM_H typedef char vm_inherit_t; /* inheritance codes */ #define VM_INHERIT_SHARE ((vm_inherit_t) 0) #define VM_INHERIT_COPY ((vm_inherit_t) 1) #define VM_INHERIT_NONE ((vm_inherit_t) 2) #define VM_INHERIT_DEFAULT VM_INHERIT_COPY typedef u_char vm_prot_t; /* protection codes */ #define VM_PROT_NONE ((vm_prot_t) 0x00) #define VM_PROT_READ ((vm_prot_t) 0x01) #define VM_PROT_WRITE ((vm_prot_t) 0x02) #define VM_PROT_EXECUTE ((vm_prot_t) 0x04) #define VM_PROT_OVERRIDE_WRITE ((vm_prot_t) 0x08) /* copy-on-write */ #define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) #define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE) #define VM_PROT_DEFAULT VM_PROT_ALL union vm_map_object; typedef union vm_map_object vm_map_object_t; struct vm_map_entry; typedef struct vm_map_entry *vm_map_entry_t; struct vm_map; typedef struct vm_map *vm_map_t; struct vm_object; typedef struct vm_object *vm_object_t; #ifndef _KERNEL /* * This is defined in for the kernel so that non-vm kernel * sources (mainly Mach-derived ones such as ddb) don't have to include * vm stuff. Defining it there for applications might break things. * Define it here for "applications" that include vm headers (e.g., * genassym). */ typedef int boolean_t; /* * This is defined in for the kernel so that vnode_if.h * doesn't have to include . */ struct vm_page; typedef struct vm_page *vm_page_t; #endif /* _KERNEL */ /* * Information passed from the machine-independant VM initialization code * for use by machine-dependant code (mainly for MMU support) */ struct kva_md_info { vm_offset_t buffer_sva; vm_offset_t buffer_eva; vm_offset_t clean_sva; vm_offset_t clean_eva; vm_offset_t pager_sva; vm_offset_t pager_eva; }; extern struct kva_md_info kmi; extern void vm_ksubmap_init(struct kva_md_info *); #endif /* VM_H */ Index: head/sys/vm/vm_contig.c =================================================================== --- head/sys/vm/vm_contig.c (revision 127960) +++ head/sys/vm/vm_contig.c (revision 127961) @@ -1,320 +1,316 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vm_contig_launder(int queue) { vm_object_t object; vm_page_t m, m_tmp, next; struct vnode *vp; for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) { next = TAILQ_NEXT(m, pageq); KASSERT(m->queue == queue, ("vm_contig_launder: page %p's queue is not %d", m, queue)); if (!VM_OBJECT_TRYLOCK(m->object)) continue; if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) { VM_OBJECT_UNLOCK(m->object); vm_page_lock_queues(); return (TRUE); } vm_page_test_dirty(m); if (m->dirty == 0 && m->busy == 0 && m->hold_count == 0) pmap_remove_all(m); if (m->dirty) { object = m->object; if (object->type == OBJT_VNODE) { vm_page_unlock_queues(); vp = object->handle; VM_OBJECT_UNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); VM_OBJECT_LOCK(object); vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0, curthread); vm_page_lock_queues(); return (TRUE); } else if (object->type == OBJT_SWAP || object->type == OBJT_DEFAULT) { m_tmp = m; vm_pageout_flush(&m_tmp, 1, 0); VM_OBJECT_UNLOCK(object); return (TRUE); } } else if (m->busy == 0 && m->hold_count == 0) vm_page_cache(m); VM_OBJECT_UNLOCK(m->object); } return (FALSE); } /* * This interface is for merging with malloc() someday. * Even if we never implement compaction so that contiguous allocation * works after initialization time, malloc()'s data structures are good * for statistics and for allocations of less than a page. */ static void * contigmalloc1( unsigned long size, /* should be size_t here and for malloc() */ struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary, vm_map_t map) { int i, start; vm_paddr_t phys; vm_object_t object; vm_offset_t addr, tmp_addr; int pass, pqtype; vm_page_t pga = vm_page_array; size = round_page(size); if (size == 0) panic("contigmalloc1: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("contigmalloc1: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("contigmalloc1: boundary must be a power of 2"); start = 0; for (pass = 0; pass <= 1; pass++) { vm_page_lock_queues(); again0: mtx_lock_spin(&vm_page_queue_free_mtx); again: /* * Find first page in array that is free, within range, * aligned, and such that the boundary won't be crossed. */ for (i = start; i < cnt.v_page_count; i++) { phys = VM_PAGE_TO_PHYS(&pga[i]); pqtype = pga[i].queue - pga[i].pc; if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)) break; } /* * If the above failed or we will exceed the upper bound, fail. */ if ((i == cnt.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { mtx_unlock_spin(&vm_page_queue_free_mtx); again1: if (vm_contig_launder(PQ_INACTIVE)) goto again1; if (vm_contig_launder(PQ_ACTIVE)) goto again1; vm_page_unlock_queues(); continue; } start = i; /* * Check successive pages for contiguous and free. 
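/*
 * [Editor's sketch, not part of revision 127961] The scan above is the
 * heart of contigmalloc(), defined a little further on in this file: it
 * hunts for a physically contiguous, suitably aligned run of free or
 * cached pages within the caller's address limits. Typical callers are
 * drivers that need DMA-able buffers; a hedged usage sketch follows
 * ("mydev_alloc_ring" is an invented name).
 */
#include <sys/param.h>
#include <sys/malloc.h>

static void *
mydev_alloc_ring(void)
{
	/*
	 * 64KB of physically contiguous memory below 4GB, page aligned
	 * and not crossing a 64KB boundary; released with contigfree().
	 */
	return (contigmalloc(65536, M_DEVBUF, M_NOWAIT, 0ul,
	    0xfffffffful, PAGE_SIZE, 65536ul));
}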
*/ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { pqtype = pga[i].queue - pga[i].pc; if ((VM_PAGE_TO_PHYS(&pga[i]) != (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) { start++; goto again; } } mtx_unlock_spin(&vm_page_queue_free_mtx); for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; if ((m->queue - m->pc) == PQ_CACHE) { object = m->object; if (!VM_OBJECT_TRYLOCK(object)) { start++; goto again0; } vm_page_busy(m); vm_page_free(m); VM_OBJECT_UNLOCK(object); } } mtx_lock_spin(&vm_page_queue_free_mtx); for (i = start; i < (start + size / PAGE_SIZE); i++) { pqtype = pga[i].queue - pga[i].pc; if (pqtype != PQ_FREE) { start++; goto again; } } for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; vm_pageq_remove_nowakeup(m); m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) vm_page_zero_count--; /* Don't clear the PG_ZERO flag, we'll need it later. */ m->flags = PG_UNMANAGED | (m->flags & PG_ZERO); KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m)); m->wire_count = 0; m->busy = 0; m->object = NULL; } mtx_unlock_spin(&vm_page_queue_free_mtx); vm_page_unlock_queues(); /* * We've found a contiguous chunk that meets are requirements. * Allocate kernel VM, unfree and assign the physical pages to * it and return kernel VM pointer. */ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr) != KERN_SUCCESS) { /* * XXX We almost never run out of kernel virtual * space, so we don't make the allocated memory * above available. */ vm_map_unlock(map); return (NULL); } vm_object_reference(kernel_object); vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); tmp_addr = addr; VM_OBJECT_LOCK(kernel_object); for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; vm_page_insert(m, kernel_object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); if ((flags & M_ZERO) && !(m->flags & PG_ZERO)) pmap_zero_page(m); tmp_addr += PAGE_SIZE; } VM_OBJECT_UNLOCK(kernel_object); vm_map_wire(map, addr, addr + size, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); return ((void *)addr); } return (NULL); } void * contigmalloc( unsigned long size, /* should be size_t here and for malloc() */ struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary) { void * ret; mtx_lock(&Giant); ret = contigmalloc1(size, type, flags, low, high, alignment, boundary, kernel_map); mtx_unlock(&Giant); return (ret); } void contigfree(void *addr, unsigned long size, struct malloc_type *type) { kmem_free(kernel_map, (vm_offset_t)addr, size); } Index: head/sys/vm/vm_extern.h =================================================================== --- head/sys/vm/vm_extern.h (revision 127960) +++ head/sys/vm/vm_extern.h (revision 127961) @@ -1,103 +1,99 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 * $FreeBSD$ */ #ifndef _VM_EXTERN_H_ #define _VM_EXTERN_H_ struct buf; struct proc; struct vmspace; struct vmtotal; struct mount; struct vnode; #ifdef _KERNEL #ifdef TYPEDEF_FOR_UAP int getpagesize(struct thread *, void *, int *); int madvise(struct thread *, void *, int *); int mincore(struct thread *, void *, int *); int mprotect(struct thread *, void *, int *); int msync(struct thread *, void *, int *); int munmap(struct thread *, void *, int *); int obreak(struct thread *, void *, int *); int sbrk(struct thread *, void *, int *); int sstk(struct thread *, void *, int *); int swapon(struct thread *, void *, int *); #endif /* TYPEDEF_FOR_UAP */ int kernacc(void *, int, int); vm_offset_t kmem_alloc(vm_map_t, vm_size_t); vm_offset_t kmem_alloc_nofault(vm_map_t, vm_size_t); vm_offset_t kmem_alloc_pageable(vm_map_t, vm_size_t); vm_offset_t kmem_alloc_wait(vm_map_t, vm_size_t); void kmem_free(vm_map_t, vm_offset_t, vm_size_t); void kmem_free_wakeup(vm_map_t, vm_offset_t, vm_size_t); void kmem_init(vm_offset_t, vm_offset_t); vm_offset_t kmem_malloc(vm_map_t, vm_size_t, boolean_t); vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t); void swapout_procs(int); int useracc(void *, int, int); int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t); void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t); int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t); void vm_forkproc(struct thread *, struct proc *, struct thread *, int); void vm_waitproc(struct proc *); int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, void *, vm_ooffset_t); void vm_set_page_size(void); struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t); struct vmspace *vmspace_fork(struct vmspace *); void vmspace_exec(struct proc *, vm_offset_t, vm_offset_t); void vmspace_unshare(struct proc *); void vmspace_free(struct vmspace *); void vmspace_exitfree(struct proc *); void vnode_pager_setsize(struct vnode *, vm_ooffset_t); int vslock(void *, size_t); void vsunlock(void *, size_t); void vm_object_print(/* db_expr_t */ long, boolean_t, /* db_expr_t */ long, char *); int vm_fault_quick(caddr_t v, int prot); void vm_proc_new(struct proc *p); void vm_proc_dispose(struct proc *p); void 
vm_thread_dispose(struct thread *td); void vm_thread_dispose_altkstack(struct thread *td); void vm_thread_new(struct thread *td, int pages); void vm_thread_new_altkstack(struct thread *td, int pages); void vm_thread_swapin(struct thread *td); void vm_thread_swapout(struct thread *td); #endif /* _KERNEL */ #endif /* !_VM_EXTERN_H_ */ Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 127960) +++ head/sys/vm/vm_glue.c (revision 127961) @@ -1,1124 +1,1120 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
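/*
 * [Editor's sketch, not part of revision 127961] vm_extern.h above
 * exports the kmem_* allocators that malloc(9) and the zone allocator
 * are layered on. Used directly, kmem_alloc() hands back wired kernel
 * virtual memory from a given map and kmem_free() releases it; the
 * helper name below is invented for illustration.
 */
#include <sys/param.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>		/* kernel_map */

static vm_offset_t
grab_wired_kva(vm_size_t bytes)
{
	/* Returns 0 if the map has no room, otherwise a wired mapping. */
	return (kmem_alloc(kernel_map, round_page(bytes)));
}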
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int maxslp; /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits(void *); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. */ static void scheduler(void *); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL) #ifndef NO_SWAPPING static void swapout(struct proc *); static void vm_proc_swapin(struct proc *p); static void vm_proc_swapout(struct proc *p); #endif /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjuction with this call. */ int useracc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); PROC_LOCK(curproc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) > lim_cur(curproc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(curproc); return (ENOMEM); } PROC_UNLOCK(curproc); #if 0 /* * XXX - not yet * * The limit for transient usage of wired pages should be * larger than for "permanent" wired pages (mlock()). * * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. 
*/ if (npages + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (error == KERN_SUCCESS ? 0 : EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Create the U area for a new process. * This routine directly affects the fork perf for a process. */ void vm_proc_new(struct proc *p) { vm_page_t ma[UAREA_PAGES]; vm_object_t upobj; vm_offset_t up; vm_page_t m; u_int i; /* * Get a kernel virtual address for the U area for this process. */ up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE); if (up == 0) panic("vm_proc_new: upage allocation failed"); p->p_uarea = (struct user *)up; /* * Allocate object and page(s) for the U area. */ upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES); p->p_upages_obj = upobj; VM_OBJECT_LOCK(upobj); for (i = 0; i < UAREA_PAGES; i++) { m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED); ma[i] = m; vm_page_lock_queues(); vm_page_wakeup(m); m->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(upobj); /* * Enter the pages into the kernel address space. */ pmap_qenter(up, ma, UAREA_PAGES); } /* * Dispose the U area for a process that has exited. * This routine directly impacts the exit perf of a process. * XXX proc_zone is marked UMA_ZONE_NOFREE, so this should never be called. */ void vm_proc_dispose(struct proc *p) { vm_object_t upobj; vm_offset_t up; vm_page_t m; upobj = p->p_upages_obj; VM_OBJECT_LOCK(upobj); if (upobj->resident_page_count != UAREA_PAGES) panic("vm_proc_dispose: incorrect number of pages in upobj"); vm_page_lock_queues(); while ((m = TAILQ_FIRST(&upobj->memq)) != NULL) { vm_page_busy(m); vm_page_unwire(m, 0); vm_page_free(m); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(upobj); up = (vm_offset_t)p->p_uarea; pmap_qremove(up, UAREA_PAGES); kmem_free(kernel_map, up, UAREA_PAGES * PAGE_SIZE); vm_object_deallocate(upobj); } #ifndef NO_SWAPPING /* * Allow the U area for a process to be prejudicially paged out. */ static void vm_proc_swapout(struct proc *p) { vm_object_t upobj; vm_offset_t up; vm_page_t m; upobj = p->p_upages_obj; VM_OBJECT_LOCK(upobj); if (upobj->resident_page_count != UAREA_PAGES) panic("vm_proc_dispose: incorrect number of pages in upobj"); vm_page_lock_queues(); TAILQ_FOREACH(m, &upobj->memq, listq) { vm_page_dirty(m); vm_page_unwire(m, 0); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(upobj); up = (vm_offset_t)p->p_uarea; pmap_qremove(up, UAREA_PAGES); } /* * Bring the U area for a specified process back in. 
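/*
 * [Editor's sketch, not part of revision 127961] vslock() and vsunlock()
 * above wire a range of the current process's user memory so that it
 * cannot be paged out while the kernel works on it; the sysctl code is
 * their main caller. The usage pattern is roughly the following
 * ("copyout_wired" is an invented name).
 */
#include <sys/param.h>
#include <sys/systm.h>		/* copyout() */
#include <vm/vm_extern.h>	/* vslock(), vsunlock() */

static int
copyout_wired(const void *kaddr, void *uaddr, size_t len)
{
	int error;

	error = vslock(uaddr, len);		/* wire the user pages */
	if (error != 0)
		return (error);			/* EINVAL, ENOMEM or EFAULT */
	error = copyout(kaddr, uaddr, len);	/* cannot stall on pagein now */
	vsunlock(uaddr, len);			/* unwire them again */
	return (error);
}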
*/ static void vm_proc_swapin(struct proc *p) { vm_page_t ma[UAREA_PAGES]; vm_object_t upobj; vm_offset_t up; vm_page_t m; int rv; int i; upobj = p->p_upages_obj; VM_OBJECT_LOCK(upobj); for (i = 0; i < UAREA_PAGES; i++) { m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(upobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("vm_proc_swapin: cannot get upage"); } ma[i] = m; } if (upobj->resident_page_count != UAREA_PAGES) panic("vm_proc_swapin: lost pages from upobj"); vm_page_lock_queues(); TAILQ_FOREACH(m, &upobj->memq, listq) { m->valid = VM_PAGE_BITS_ALL; vm_page_wire(m); vm_page_wakeup(m); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(upobj); up = (vm_offset_t)p->p_uarea; pmap_qenter(up, ma, UAREA_PAGES); } /* * Swap in the UAREAs of all processes swapped out to the given device. * The pages in the UAREA are marked dirty and their swap metadata is freed. */ void vm_proc_swapin_all(struct swdevt *devidx) { struct proc *p; vm_object_t object; vm_page_t m; retry: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); object = p->p_upages_obj; if (object != NULL) { VM_OBJECT_LOCK(object); if (swap_pager_isswapped(object, devidx)) { VM_OBJECT_UNLOCK(object); sx_sunlock(&allproc_lock); faultin(p); PROC_UNLOCK(p); VM_OBJECT_LOCK(object); vm_page_lock_queues(); TAILQ_FOREACH(m, &object->memq, listq) vm_page_dirty(m); vm_page_unlock_queues(); swap_pager_freespace(object, 0, object->un_pager.swp.swp_bcount); VM_OBJECT_UNLOCK(object); goto retry; } VM_OBJECT_UNLOCK(object); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } #endif #ifndef KSTACK_MAX_PAGES #define KSTACK_MAX_PAGES 32 #endif /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ void vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i; /* Bounds check */ if (pages <= 1) pages = KSTACK_PAGES; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); td->td_kstack_obj = ksobj; /* * Get a kernel virtual address for this thread's kstack. */ ks = kmem_alloc_nofault(kernel_map, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); if (ks == 0) panic("vm_thread_new: kstack allocation failed"); if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } td->td_kstack = ks; /* * Knowing the number of pages allocated is useful when you * want to deallocate them. */ td->td_kstack_pages = pages; /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { /* * Get a kernel stack page. */ m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED); ma[i] = m; vm_page_lock_queues(); vm_page_wakeup(m); m->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(ks, ma, pages); } /* * Dispose of a thread's kernel stack. 
*/ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m; int i, pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; pmap_qremove(ks, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock_queues(); vm_page_busy(m); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); vm_object_deallocate(ksobj); kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Allow a thread's kernel stack to be paged out. */ void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_lock_queues(); vm_page_dirty(m); vm_page_unwire(m, 0); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. */ void vm_thread_swapin(struct thread *td) { vm_object_t ksobj; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(ksobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid); m = vm_page_lookup(ksobj, i); m->valid = VM_PAGE_BITS_ALL; } ma[i] = m; vm_page_lock_queues(); vm_page_wire(m); vm_page_wakeup(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } /* * Set up a variable-sized alternate kstack. */ void vm_thread_new_altkstack(struct thread *td, int pages) { td->td_altkstack = td->td_kstack; td->td_altkstack_obj = td->td_kstack_obj; td->td_altkstack_pages = td->td_kstack_pages; vm_thread_new(td, pages); } /* * Restore the original kstack. */ void vm_thread_dispose_altkstack(struct thread *td) { vm_thread_dispose(td); td->td_kstack = td->td_altkstack; td->td_kstack_obj = td->td_altkstack_obj; td->td_kstack_pages = td->td_altkstack_pages; td->td_altkstack = 0; td->td_altkstack_obj = NULL; td->td_altkstack_pages = 0; } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ void vm_forkproc(td, p2, td2, flags) struct thread *td; struct proc *p2; struct thread *td2; int flags; { struct proc *p1 = td->td_proc; struct user *up; GIANT_REQUIRED; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. 
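/*
 * [Editor's sketch, not part of revision 127961] vm_thread_new() above
 * leaves KSTACK_GUARD_PAGES of deliberately unmapped KVA below each
 * kernel stack, so that an overflow faults instead of silently
 * scribbling over adjacent memory, and records the stack base and page
 * count in the thread. Machine-dependent code derives the initial stack
 * pointer from those fields; a hypothetical helper expressing that:
 */
#include <sys/param.h>
#include <sys/proc.h>

static __inline vm_offset_t
kstack_top(struct thread *td)
{
	/* The stack grows down from the end of the wired kstack pages. */
	return (td->td_kstack + td->td_kstack_pages * PAGE_SIZE);
}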
*/ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { vmspace_unshare(p1); } } cpu_fork(td, p2, td2, flags); return; } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; p1->p_vmspace->vm_refcnt++; } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* XXXKSE this is unsatisfactory but should be adequate */ up = p2->p_uarea; MPASS(p2->p_sigacts != NULL); /* * p_stats currently points at fields in the user struct * but not at &u, instead at p_addr. Copy parts of * p_stats; zero the rest of p_stats (statistics). */ p2->p_stats = &up->u_stats; bzero(&up->u_stats.pstat_startzero, (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - (caddr_t) &up->u_stats.pstat_startzero)); bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, ((caddr_t) &up->u_stats.pstat_endcopy - (caddr_t) &up->u_stats.pstat_startcopy)); /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); } /* * Called after process has been wait(2)'ed apon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { GIANT_REQUIRED; vmspace_exitfree(p); /* and clean-out the vmspace */ } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { struct proc *p = udata; struct plimit *limp; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ limp = p->p_limit; limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } void faultin(p) struct proc *p; { #ifdef NO_SWAPPING PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_sflag & PS_INMEM) == 0) panic("faultin: proc swapped out with NO_SWAPPING!"); #else /* !NO_SWAPPING */ struct thread *td; GIANT_REQUIRED; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_sflag & PS_SWAPPINGIN) msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0); else if ((p->p_sflag & PS_INMEM) == 0) { /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; mtx_lock_spin(&sched_lock); p->p_sflag |= PS_SWAPPINGIN; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); vm_proc_swapin(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPINGIN; p->p_sflag |= PS_INMEM; FOREACH_THREAD_IN_PROC(p, td) { TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); } mtx_unlock_spin(&sched_lock); wakeup(&p->p_sflag); /* Allow other threads to swap p out now. */ --p->p_lock; } #endif /* NO_SWAPPING */ } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. 
Of course, if a process waits for a long * time, it will be swapped in anyway. * * XXXKSE - process with the thread with highest priority counts.. * * Giant is still held at this point, to be released in tsleep. */ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { struct proc *p; struct thread *td; int pri; struct proc *pp; int ppri; mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); /* GIANT_REQUIRED */ loop: if (vm_page_count_min()) { VM_WAIT; goto loop; } pp = NULL; ppri = INT_MIN; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct ksegrp *kg; if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { continue; } mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ if (td->td_inhibitors == TDI_SWAPPED) { kg = td->td_ksegrp; pri = p->p_swtime + kg->kg_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { pri -= kg->kg_nice * 8; } /* * if this ksegrp is higher priority * and there is enough space, then select * this process instead of the previous * selection. */ if (pri > ppri) { pp = p; ppri = pri; } } } mtx_unlock_spin(&sched_lock); } sx_sunlock(&allproc_lock); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { tsleep(&proc0, PVM, "sched", maxslp * hz / 2); goto loop; } PROC_LOCK(p); /* * Another process may be bringing or may have already * brought this process in while we traverse all threads. * Or, this process may even be being swapped out again. */ if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { PROC_UNLOCK(p); goto loop; } mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPINREQ; mtx_unlock_spin(&sched_lock); /* * We would like to bring someone in. (only if there is space). * [What checks the space? ] */ faultin(p); PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); p->p_swtime = 0; mtx_unlock_spin(&sched_lock); goto loop; } #ifndef NO_SWAPPING /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, "Time before a process will be swapped out"); /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. */ void swapout_procs(action) int action; { struct proc *p; struct thread *td; struct ksegrp *kg; int didswap = 0; GIANT_REQUIRED; retry: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct vmspace *vm; int minslptime = 100000; /* * Watch out for a process in * creation. It may have no * address space or lock yet. */ mtx_lock_spin(&sched_lock); if (p->p_state == PRS_NEW) { mtx_unlock_spin(&sched_lock); continue; } mtx_unlock_spin(&sched_lock); /* * An aio daemon switches its * address space while running. * Perform a quick check whether * a process has P_SYSTEM. 
*/ if ((p->p_flag & P_SYSTEM) != 0) continue; /* * Do not swapout a process that * is waiting for VM data * structures as there is a possible * deadlock. Test this first as * this may block. * * Lock the map until swapout * finishes, or a thread of this * process may attempt to alter * the map. */ PROC_LOCK(p); vm = p->p_vmspace; KASSERT(vm != NULL, ("swapout_procs: a process has no address space")); ++vm->vm_refcnt; PROC_UNLOCK(p); if (!vm_map_trylock(&vm->vm_map)) goto nextproc1; PROC_LOCK(p); if (p->p_lock != 0 || (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT) ) != 0) { goto nextproc2; } /* * only aiod changes vmspace, however it will be * skipped because of the if statement above checking * for P_SYSTEM */ if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM) goto nextproc2; switch (p->p_state) { default: /* Don't swap out processes in any sort * of 'special' state. */ break; case PRS_NORMAL: mtx_lock_spin(&sched_lock); /* * do not swapout a realtime process * Check all the thread groups.. */ FOREACH_KSEGRP_IN_PROC(p, kg) { if (PRI_IS_REALTIME(kg->kg_pri_class)) goto nextproc; /* * Guarantee swap_idle_threshold1 * time in memory. */ if (kg->kg_slptime < swap_idle_threshold1) goto nextproc; /* * Do not swapout a process if it is * waiting on a critical event of some * kind or there is a thread whose * pageable memory may be accessed. * * This could be refined to support * swapping out a thread. */ FOREACH_THREAD_IN_GROUP(kg, td) { if ((td->td_priority) < PSOCK || !thread_safetoswapout(td)) goto nextproc; } /* * If the system is under memory stress, * or if we are swapping * idle processes >= swap_idle_threshold2, * then swap the process out. */ if (((action & VM_SWAP_NORMAL) == 0) && (((action & VM_SWAP_IDLE) == 0) || (kg->kg_slptime < swap_idle_threshold2))) goto nextproc; if (minslptime > kg->kg_slptime) minslptime = kg->kg_slptime; } /* * If the process has been asleep for awhile and had * most of its pages taken away already, swap it out. */ if ((action & VM_SWAP_NORMAL) || ((action & VM_SWAP_IDLE) && (minslptime > swap_idle_threshold2))) { swapout(p); didswap++; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); sx_sunlock(&allproc_lock); goto retry; } nextproc: mtx_unlock_spin(&sched_lock); } nextproc2: PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); nextproc1: vmspace_free(vm); continue; } sx_sunlock(&allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapout(p) struct proc *p; { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif /* * The states of this process and its threads may have changed * by now. Assuming that there is only one pageout daemon thread, * this process should still be in memory. */ KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM, ("swapout: lost a swapout race?")); #if defined(INVARIANTS) /* * Make sure that all threads are safe to be swapped out. * * Alternatively, we could swap out only safe threads. 
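/*
 * [Illustrative aside, not part of this revision.]  Condensed form of the
 * per-ksegrp eligibility test used by swapout_procs() above: realtime groups
 * are never swapped, a group must have slept at least swap_idle_threshold1
 * seconds, and under idle-only swapping it must additionally exceed
 * swap_idle_threshold2.  The flag values and standalone wrapper are
 * hypothetical; only the shape of the test is taken from the code above.
 */
#define	VM_SWAP_NORMAL	0x01
#define	VM_SWAP_IDLE	0x02

static int swap_idle_threshold1 = 2;	/* guaranteed seconds in memory */
static int swap_idle_threshold2 = 10;	/* idle seconds before swapout */

static int
eligible_for_swapout(int action, int slptime, int realtime)
{
	if (realtime)
		return (0);		/* never swap out a realtime group */
	if (slptime < swap_idle_threshold1)
		return (0);		/* guarantee minimum residency */
	if ((action & VM_SWAP_NORMAL) == 0 &&
	    ((action & VM_SWAP_IDLE) == 0 ||
	    slptime < swap_idle_threshold2))
		return (0);		/* idle swapping only, and not idle enough */
	return (1);
}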
*/ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(thread_safetoswapout(td), ("swapout: there is a thread not safe for swapout")); } #endif /* INVARIANTS */ ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPINGOUT; PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) TD_SET_SWAPPED(td); mtx_unlock_spin(&sched_lock); vm_proc_swapout(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPINGOUT; p->p_swtime = 0; } #endif /* !NO_SWAPPING */ Index: head/sys/vm/vm_init.c =================================================================== --- head/sys/vm/vm_init.c (revision 127960) +++ head/sys/vm/vm_init.c (revision 127961) @@ -1,213 +1,209 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_init.c 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Initialize the Virtual Memory subsystem. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include long physmem; /* * System initialization */ static void vm_mem_init(void *); SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL) /* * vm_init initializes the virtual memory system. * This is done only by the first cpu up. * * The start and end address of physical memory is passed in. */ /* ARGSUSED*/ static void vm_mem_init(dummy) void *dummy; { /* * Initializes resident memory structures. From here on, all physical * memory is accounted for, and we use only virtual addresses. */ vm_set_page_size(); virtual_avail = vm_page_startup(virtual_avail); /* * Initialize other VM packages */ vm_object_init(); vm_map_startup(); kmem_init(virtual_avail, virtual_end); pmap_init(); vm_pager_init(); } void vm_ksubmap_init(struct kva_md_info *kmi) { vm_offset_t firstaddr; caddr_t v; vm_size_t size = 0; long physmem_est; vm_offset_t minaddr; vm_offset_t maxaddr; vm_map_t clean_map; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; v = kern_timeout_callwheel_alloc(v); /* * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. */ if (kernel_map->first_free == NULL) { printf("Warning: no free entries in kernel_map.\n"); physmem_est = physmem; } else { physmem_est = lmin(physmem, btoc(kernel_map->max_offset - kernel_map->min_offset)); } v = kern_vfs_bio_buffer_alloc(v, physmem_est); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)v; firstaddr = kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)((char *)v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva, &kmi->buffer_eva, (nbuf*BKVASIZE)); buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva); /* * XXX: Mbuf system machine-specific initializations should * go here, if anywhere. */ /* * Initialize the callouts we just allocated. 
*/ kern_timeout_callwheel_init(); } Index: head/sys/vm/vm_kern.c =================================================================== --- head/sys/vm/vm_kern.c (revision 127960) +++ head/sys/vm/vm_kern.c (revision 127961) @@ -1,539 +1,535 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Kernel memory management. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include /* for ticks and hz */ #include #include #include #include #include #include #include #include #include #include #include #include vm_map_t kernel_map=0; vm_map_t kmem_map=0; vm_map_t exec_map=0; vm_map_t pipe_map; vm_map_t buffer_map=0; /* * kmem_alloc_pageable: * * Allocate pageable memory to the kernel's address map. * "map" must be kernel_map or a submap of kernel_map. */ vm_offset_t kmem_alloc_pageable(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; int result; size = round_page(size); addr = vm_map_min(map); result = vm_map_find(map, NULL, 0, &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); if (result != KERN_SUCCESS) { return (0); } return (addr); } /* * kmem_alloc_nofault: * * Allocate a virtual address range with no underlying object and * no initial mapping to physical memory. Any mapping from this * range to physical memory must be explicitly created prior to * its use, typically with pmap_qenter(). Any attempt to create * a mapping on demand through vm_fault() will result in a panic. */ vm_offset_t kmem_alloc_nofault(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; int result; size = round_page(size); addr = vm_map_min(map); result = vm_map_find(map, NULL, 0, &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); if (result != KERN_SUCCESS) { return (0); } return (addr); } /* * Allocate wired-down memory in the kernel's address map * or a submap. */ vm_offset_t kmem_alloc(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; vm_offset_t offset; vm_offset_t i; size = round_page(size); /* * Use the kernel object for wired-down kernel pages. Assume that no * region of the kernel object is referenced more than once. */ /* * Locate sufficient space in the map. This will give us the final * virtual address for the new memory, and thus will tell us the * offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr)) { vm_map_unlock(map); return (0); } offset = addr - VM_MIN_KERNEL_ADDRESS; vm_object_reference(kernel_object); vm_map_insert(map, kernel_object, offset, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Guarantee that there are pages already in this object before * calling vm_map_wire. This is to prevent the following * scenario: * * 1) Threads have swapped out, so that there is a pager for the * kernel_object. 2) The kmsg zone is empty, and so we are * kmem_allocing a new page for it. 3) vm_map_wire calls vm_fault; * there is no page, but there is a pager, so we call * pager_data_request. But the kmsg zone is empty, so we must * kmem_alloc. 4) goto 1 5) Even if the kmsg zone is not empty: when * we get the data back from the pager, it will be (very stale) * non-zero data. kmem_alloc is defined to return zero-filled memory. * * We're intentionally not activating the pages we allocate to prevent a * race with page-out. vm_map_wire will wire the pages. */ VM_OBJECT_LOCK(kernel_object); for (i = 0; i < size; i += PAGE_SIZE) { vm_page_t mem; mem = vm_page_grab(kernel_object, OFF_TO_IDX(offset + i), VM_ALLOC_ZERO | VM_ALLOC_RETRY); if ((mem->flags & PG_ZERO) == 0) pmap_zero_page(mem); mem->valid = VM_PAGE_BITS_ALL; vm_page_lock_queues(); vm_page_unmanage(mem); vm_page_wakeup(mem); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(kernel_object); /* * And finally, mark the data as non-pageable. 
*/ (void) vm_map_wire(map, addr, addr + size, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); return (addr); } /* * kmem_free: * * Release a region of kernel virtual memory allocated * with kmem_alloc, and return the physical pages * associated with that region. * * This routine may not block on kernel maps. */ void kmem_free(map, addr, size) vm_map_t map; vm_offset_t addr; vm_size_t size; { (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size)); } /* * kmem_suballoc: * * Allocates a map to manage a subrange * of the kernel virtual address space. * * Arguments are as follows: * * parent Map to take range from * min, max Returned endpoints of map * size Size of range to find */ vm_map_t kmem_suballoc(parent, min, max, size) vm_map_t parent; vm_offset_t *min, *max; vm_size_t size; { int ret; vm_map_t result; size = round_page(size); *min = (vm_offset_t) vm_map_min(parent); ret = vm_map_find(parent, NULL, (vm_offset_t) 0, min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); if (ret != KERN_SUCCESS) { printf("kmem_suballoc: bad status return of %d.\n", ret); panic("kmem_suballoc"); } *max = *min + size; result = vm_map_create(vm_map_pmap(parent), *min, *max); if (result == NULL) panic("kmem_suballoc: cannot create submap"); if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS) panic("kmem_suballoc: unable to change range to submap"); return (result); } /* * kmem_malloc: * * Allocate wired-down memory in the kernel's address map for the higher * level kernel memory allocator (kern/kern_malloc.c). We cannot use * kmem_alloc() because we may need to allocate memory at interrupt * level where we cannot block (canwait == FALSE). * * This routine has its own private kernel submap (kmem_map) and object * (kmem_object). This, combined with the fact that only malloc uses * this routine, ensures that we will never block in map or object waits. * * Note that this still only works in a uni-processor environment and * when called at splhigh(). * * We don't worry about expanding the map (adding entries) since entries * for wired maps are statically allocated. * * NOTE: This routine is not supposed to block if M_NOWAIT is set, but * I have not verified that it actually does not block. * * `map' is ONLY allowed to be kmem_map or one of the mbuf submaps to * which we never free. */ vm_offset_t kmem_malloc(map, size, flags) vm_map_t map; vm_size_t size; int flags; { vm_offset_t offset, i; vm_map_entry_t entry; vm_offset_t addr; vm_page_t m; int pflags; size = round_page(size); addr = vm_map_min(map); /* * Locate sufficient space in the map. This will give us the final * virtual address for the new memory, and thus will tell us the * offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr)) { vm_map_unlock(map); if (map != kmem_map) { static int last_report; /* when we did it (in ticks) */ if (ticks < last_report || (ticks - last_report) >= hz) { last_report = ticks; printf("Out of mbuf address space!\n"); printf("Consider increasing NMBCLUSTERS\n"); } return (0); } if ((flags & M_NOWAIT) == 0) panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated", (long)size, (long)map->size); return (0); } offset = addr - VM_MIN_KERNEL_ADDRESS; vm_object_reference(kmem_object); vm_map_insert(map, kmem_object, offset, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); /* * Note: if M_NOWAIT specified alone, allocate from * interrupt-safe queues only (just the free list). If * M_USE_RESERVE is also specified, we can also * allocate from the cache. 
Neither of the latter two * flags may be specified from an interrupt since interrupts * are not allowed to mess with the cache queue. */ if ((flags & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT) pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED; else pflags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED; if (flags & M_ZERO) pflags |= VM_ALLOC_ZERO; VM_OBJECT_LOCK(kmem_object); for (i = 0; i < size; i += PAGE_SIZE) { retry: m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), pflags); /* * Ran out of space, free everything up and return. Don't need * to lock page queues here as we know that the pages we got * aren't on any queues. */ if (m == NULL) { if ((flags & M_NOWAIT) == 0) { VM_OBJECT_UNLOCK(kmem_object); vm_map_unlock(map); VM_WAIT; vm_map_lock(map); VM_OBJECT_LOCK(kmem_object); goto retry; } /* * Free the pages before removing the map entry. * They are already marked busy. Calling * vm_map_delete before the pages has been freed or * unbusied will cause a deadlock. */ while (i != 0) { i -= PAGE_SIZE; m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); vm_page_lock_queues(); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(kmem_object); vm_map_delete(map, addr, addr + size); vm_map_unlock(map); return (0); } if (flags & M_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; vm_page_lock_queues(); vm_page_unmanage(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(kmem_object); /* * Mark map entry as non-pageable. Assert: vm_map_insert() will never * be able to extend the previous entry so there will be a new entry * exactly corresponding to this address range and it will have * wired_count == 0. */ if (!vm_map_lookup_entry(map, addr, &entry) || entry->start != addr || entry->end != addr + size || entry->wired_count != 0) panic("kmem_malloc: entry not found or misaligned"); entry->wired_count = 1; /* * At this point, the kmem_object must be unlocked because * vm_map_simplify_entry() calls vm_object_deallocate(), which * locks the kmem_object. */ vm_map_simplify_entry(map, entry); /* * Loop thru pages, entering them in the pmap. (We cannot add them to * the wired count without wrapping the vm_page_queue_lock in * splimp...) */ VM_OBJECT_LOCK(kmem_object); for (i = 0; i < size; i += PAGE_SIZE) { m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); /* * Because this is kernel_pmap, this call will not block. */ pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL, 1); vm_page_lock_queues(); vm_page_flag_set(m, PG_WRITEABLE | PG_REFERENCED); vm_page_wakeup(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(kmem_object); vm_map_unlock(map); return (addr); } /* * kmem_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * * This routine may block. */ vm_offset_t kmem_alloc_wait(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; size = round_page(size); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. 
*/ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); return (0); } map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, FALSE); } vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); return (addr); } /* * kmem_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmem_free_wakeup(map, addr, size) vm_map_t map; vm_offset_t addr; vm_size_t size; { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); if (map->needs_wakeup) { map->needs_wakeup = FALSE; vm_map_wakeup(map); } vm_map_unlock(map); } /* * kmem_init: * * Create the kernel map; insert a mapping covering kernel text, * data, bss, and all space allocated thus far (`boostrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. */ void kmem_init(start, end) vm_offset_t start, end; { vm_map_t m; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); m->system_map = 1; vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ kernel_map = m; (void) vm_map_insert(m, NULL, (vm_ooffset_t) 0, VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, 0); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } Index: head/sys/vm/vm_kern.h =================================================================== --- head/sys/vm/vm_kern.h (revision 127960) +++ head/sys/vm/vm_kern.h (revision 127961) @@ -1,79 +1,75 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.h 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_VM_KERN_H_ #define _VM_VM_KERN_H_ 1 /* Kernel memory management definitions. */ extern vm_map_t buffer_map; extern vm_map_t kernel_map; extern vm_map_t kmem_map; extern vm_map_t clean_map; extern vm_map_t exec_map; extern vm_map_t pipe_map; extern u_int vm_kmem_size; #endif /* _VM_VM_KERN_H_ */ Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 127960) +++ head/sys/vm/vm_map.c (revision 127961) @@ -1,3117 +1,3113 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a single hint is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. 
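/*
 * [Illustrative aside, not part of this revision.]  The restriction above is
 * a bootstrap problem: malloc() needs the kernel map, and the kernel map
 * needs entries, so entries for system maps must come from storage that does
 * not itself depend on malloc().  A standalone sketch of the idea only: a
 * small static pool serves "system" entries, the general allocator serves
 * the rest.  The pool size and helper are hypothetical, not the UMA zones
 * used by the code above.
 */
#include <stdlib.h>

struct map_entry { struct map_entry *next; /* ... */ };

#define	NSTATIC	64
static struct map_entry static_entries[NSTATIC];
static struct map_entry *static_free;
static int static_seeded;

static struct map_entry *
entry_alloc(int system_map)
{
	struct map_entry *e;
	int i;

	if (!system_map)
		return (malloc(sizeof(*e)));	/* ordinary path */
	if (!static_seeded) {			/* seed the static pool once */
		for (i = 0; i < NSTATIC; i++) {
			static_entries[i].next = static_free;
			static_free = &static_entries[i];
		}
		static_seeded = 1;
	}
	if ((e = static_free) != NULL)
		static_free = e->next;
	return (e);				/* NULL if the pool is empty */
}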
*/ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t mapzone; static uma_zone_t vmspace_zone; static struct vm_object kmapentobj; static void vmspace_zinit(void *mem, int size); static void vmspace_zfini(void *mem, int size); static void vm_map_zinit(void *mem, int size); static void vm_map_zfini(void *mem, int size); static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, #ifdef INVARIANTS vm_map_zdtor, #else NULL, #endif vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_prealloc(mapzone, MAX_KMAP); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); uma_prealloc(kmapentzone, MAX_KMAPENT); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(mapentzone, MAX_MAPENT); } static void vmspace_zfini(void *mem, int size) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map)); } static void vmspace_zinit(void *mem, int size) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map)); } static void vm_map_zfini(void *mem, int size) { vm_map_t map; map = (vm_map_t)mem; mtx_destroy(&map->system_mtx); lockdestroy(&map->lock); } static void vm_map_zinit(void *mem, int size) { vm_map_t map; map = (vm_map_t)mem; map->nentries = 0; map->size = 0; map->infork = 0; mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); } static void vm_map_zdtor(void *mem, int size, void *arg) { vm_map_t map; map = (vm_map_t)mem; KASSERT(map->nentries == 0, ("map %p nentries == %d on free.", map, map->nentries)); KASSERT(map->size == 0, ("map %p size == %lu on free.", map, (unsigned long)map->size)); KASSERT(map->infork == 0, ("map %p infork == %d on free.", map, map->infork)); } #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * The remaining fields must be initialized by the caller. */ struct vmspace * vmspace_alloc(min, max) vm_offset_t min, max; { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, min, max); pmap_pinit(vmspace_pmap(vm)); vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */ vm->vm_refcnt = 1; vm->vm_shm = NULL; vm->vm_exitingcnt = 0; return (vm); } void vm_init2(void) { uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count, (VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8 + maxproc * 2 + maxfiles); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); pmap_init2(); } static __inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). 
*/ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ vm_map_lock(&vm->vm_map); (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); vm_map_unlock(&vm->vm_map); pmap_release(vmspace_pmap(vm)); uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { GIANT_REQUIRED; if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (--vm->vm_refcnt == 0 && vm->vm_exitingcnt == 0) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; GIANT_REQUIRED; vm = p->p_vmspace; p->p_vmspace = NULL; /* * cleanup by parent process wait()ing on exiting child. vm_refcnt * may not be 0 (e.g. fork() and child exits without exec()ing). * exitingcnt may increment above 0 and drop back down to zero * several times while vm_refcnt is held non-zero. vm_refcnt * may also increment above 0 and drop back down to zero several * times while vm_exitingcnt is held non-zero. * * The last wait on the exiting child's vmspace will clean up * the remainder of the vmspace. */ if (--vm->vm_exitingcnt == 0 && vm->vm_refcnt == 0) vmspace_dofree(vm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { int error; if (map->system_map) _mtx_lock_flags(&map->system_mtx, 0, file, line); else { error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread); KASSERT(error == 0, ("%s: failed to get lock", __func__)); } map->timestamp++; } void _vm_map_unlock(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_unlock_flags(&map->system_mtx, 0, file, line); else lockmgr(&map->lock, LK_RELEASE, NULL, curthread); } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { int error; if (map->system_map) _mtx_lock_flags(&map->system_mtx, 0, file, line); else { error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread); KASSERT(error == 0, ("%s: failed to get lock", __func__)); } } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_unlock_flags(&map->system_mtx, 0, file, line); else lockmgr(&map->lock, LK_RELEASE, NULL, curthread); } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !_mtx_trylock(&map->system_mtx, 0, file, line) : lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, NULL, curthread); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? 
!_mtx_trylock(&map->system_mtx, 0, file, line) : lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, NULL, curthread); return (error == 0); } int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { if (map->system_map) { #ifdef INVARIANTS _mtx_assert(&map->system_mtx, MA_OWNED, file, line); #endif } else KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE, ("%s: lock not held", __func__)); map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { if (map->system_map) { #ifdef INVARIANTS _mtx_assert(&map->system_mtx, MA_OWNED, file, line); #endif } else KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE, ("%s: lock not held", __func__)); } /* * vm_map_unlock_and_wait: */ int vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait) { mtx_lock(&map_sleep_mtx); vm_map_unlock(map); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", 0)); } /* * vm_map_wakeup: */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the vm_map_unlock() * and the msleep() in vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } long vmspace_wired_count(struct vmspace *vmspace) { return pmap_wired_count(vmspace_pmap(vmspace)); } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) { vm_map_t result; result = uma_zalloc(mapzone, M_WAITOK); CTR1(KTR_VM, "vm_map_create: %p", result); _vm_map_init(result, min, max); result->pmap = pmap; return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. * The pmap is set elsewhere. */ static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) { map->header.next = map->header.prev = &map->header; map->needs_wakeup = FALSE; map->system_map = 0; map->min_offset = min; map->max_offset = max; map->first_free = &map->header; map->root = NULL; map->timestamp = 0; } void vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, min, max); mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. */ static __inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_splay: * * Implements Sleator and Tarjan's top-down splay algorithm. 
Returns * the vm_map_entry containing the given address. If, however, that * address is not found in the vm_map, returns a vm_map_entry that is * adjacent to the address, coming before or after it. */ static vm_map_entry_t vm_map_entry_splay(vm_offset_t address, vm_map_entry_t root) { struct vm_map_entry dummy; vm_map_entry_t lefttreemax, righttreemin, y; if (root == NULL) return (root); lefttreemax = righttreemin = &dummy; for (;; root = y) { if (address < root->start) { if ((y = root->left) == NULL) break; if (address < y->start) { /* Rotate right. */ root->left = y->right; y->right = root; root = y; if ((y = root->left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->left = root; righttreemin = root; } else if (address >= root->end) { if ((y = root->right) == NULL) break; if (address >= y->end) { /* Rotate left. */ root->right = y->left; y->left = root; root = y; if ((y = root->right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->right = root; lefttreemax = root; } else break; } /* Assemble the new root. */ lefttreemax->right = root->left; righttreemin->left = root->right; root->left = dummy.right; root->right = dummy.left; return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) { CTR4(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map, map->nentries, entry, after_where); map->nentries++; entry->prev = after_where; entry->next = after_where->next; entry->next->prev = entry; after_where->next = entry; if (after_where != &map->header) { if (after_where != map->root) vm_map_entry_splay(after_where->start, map->root); entry->right = after_where->right; entry->left = after_where; after_where->right = NULL; } else { entry->right = map->root; entry->left = NULL; } map->root = entry; } static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev, root; if (entry != map->root) vm_map_entry_splay(entry->start, map->root); if (entry->left == NULL) root = entry->right; else { root = vm_map_entry_splay(entry->start, entry->left); root->right = entry->right; } map->root = root; prev = entry->prev; next = entry->next; next->prev = prev; prev->next = next; map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur; cur = vm_map_entry_splay(address, map->root); if (cur == NULL) *entry = &map->header; else { map->root = cur; if (address >= cur->start) { *entry = cur; if (cur->end > address) return (TRUE); } else *entry = cur->prev; } return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. 
*/ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_eflags_t protoeflags; /* * Check that the start and end points are not bogus. */ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) { protoeflags |= MAP_ENTRY_NOFAULT; KASSERT(object == NULL, ("vm_map_insert: paradoxical MAP_NOFAULT request")); } if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ VM_OBJECT_LOCK(object); if (object->ref_count > 1 || object->shadow_count != 0) vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(object); } else if ((prev_entry != &map->header) && (prev_entry->eflags == protoeflags) && (prev_entry->end == start) && (prev_entry->wired_count == 0) && ((prev_entry->object.vm_object == NULL) || vm_object_coalesce(prev_entry->object.vm_object, OFF_TO_IDX(prev_entry->offset), (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end)))) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) && (prev_entry->protection == prot) && (prev_entry->max_protection == max)) { map->size += (end - prev_entry->end); prev_entry->end = end; vm_map_simplify_entry(map, prev_entry); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); } /* * NOTE: if conditionals fail, object can be NULL here. This occurs * in things like the buffer map where we manage kva but do not manage * backing objects. 
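/*
 * [Illustrative aside, not part of this revision.]  vm_map_insert() above
 * prefers growing the previous entry when the new range begins exactly at
 * prev->end, the previous entry is unwired, and protection and inheritance
 * match; only otherwise is a fresh entry created.  A standalone sketch of
 * that "extend if compatible" decision on a simplified entry type
 * (hypothetical, not the kernel's struct vm_map_entry).
 */
#include <stdbool.h>
#include <stddef.h>

struct range_entry {
	size_t start, end;	/* [start, end) */
	int prot, max_prot;
	int inheritance;
	int wired_count;
};

/* Grow 'prev' to cover [start, end) if compatible; else report "new entry". */
static bool
try_extend(struct range_entry *prev, size_t start, size_t end,
    int prot, int max_prot, int inheritance)
{
	if (prev == NULL || prev->end != start || prev->wired_count != 0)
		return (false);
	if (prev->prot != prot || prev->max_prot != max_prot ||
	    prev->inheritance != inheritance)
		return (false);
	prev->end = end;		/* coalesce instead of allocating */
	return (true);
}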
*/ /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->avail_ssize = 0; new_entry->inheritance = VM_INHERIT_DEFAULT; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; /* * Update the free space hint */ if ((map->first_free == prev_entry) && (prev_entry->end >= new_entry->start)) { map->first_free = new_entry; } #if 0 /* * Temporarily removed to avoid MAP_STACK panic, due to * MAP_STACK being a huge hack. Will be added back in * when MAP_STACK (and the user stack mapping) is fixed. */ /* * It may be possible to simplify the entry */ vm_map_simplify_entry(map, new_entry); #endif if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) { vm_map_pmap_enter(map, start, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * Find sufficient space for `length' bytes in the given map, starting at * `start'. The map must be locked. Returns 0 on success, 1 on no space. */ int vm_map_findspace( vm_map_t map, vm_offset_t start, vm_size_t length, vm_offset_t *addr) { vm_map_entry_t entry, next; vm_offset_t end; if (start < map->min_offset) start = map->min_offset; if (start > map->max_offset) return (1); /* * Look for the first possible address; if there's already something * at this address, we have to start after it. */ if (start == map->min_offset) { if ((entry = map->first_free) != &map->header) start = entry->end; } else { vm_map_entry_t tmp; if (vm_map_lookup_entry(map, start, &tmp)) start = tmp->end; entry = tmp; } /* * Look through the rest of the map, trying to fit a new region in the * gap between existing regions, or after the very last region. */ for (;; start = (entry = next)->end) { /* * Find the end of the proposed new region. Be sure we didn't * go beyond the end of the map, or wrap around the address; * if so, we lose. Otherwise, if this is the last entry, or * if the proposed new region fits before the next entry, we * win. */ end = start + length; if (end > map->max_offset || end < start) return (1); next = entry->next; if (next == &map->header || next->start >= end) break; } *addr = start; if (map == kernel_map) { vm_offset_t ksize; if ((ksize = round_page(start + length)) > kernel_vm_end) { pmap_growkernel(ksize); } } return (0); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. 
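/*
 * [Illustrative aside, not part of this revision.]  vm_map_findspace() above
 * is a first-fit search: starting from a hint, walk the entries in address
 * order and stop at the first gap (between one entry's end and the next
 * entry's start, or after the last entry) large enough for the request,
 * failing if the candidate would wrap or run past the end of the map.  A
 * standalone sketch over a sorted array of [start, end) allocations.
 */
#include <stddef.h>

struct alloc { size_t start, end; };	/* sorted, non-overlapping */

/* Returns 0 and sets *addr on success, 1 if there is no space. */
static int
findspace(const struct alloc *a, int n, size_t map_min, size_t map_max,
    size_t start, size_t length, size_t *addr)
{
	size_t end;
	int i;

	if (start < map_min)
		start = map_min;
	if (start > map_max)
		return (1);
	for (i = 0; ; i++) {
		end = start + length;
		if (end > map_max || end < start)	/* past the map, or wrapped */
			return (1);
		if (i == n || a[i].start >= end)	/* fits before the next entry */
			break;
		if (a[i].end > start)			/* overlap: skip past this entry */
			start = a[i].end;
	}
	*addr = start;
	return (0);
}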
*/ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, boolean_t find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t start; int result, s = 0; start = *addr; if (map == kmem_map) s = splvm(); vm_map_lock(map); if (find_space) { if (vm_map_findspace(map, start, length, addr)) { vm_map_unlock(map); if (map == kmem_map) splx(s); return (KERN_NO_SPACE); } start = *addr; } result = vm_map_insert(map, object, offset, start, start + length, prot, max, cow); vm_map_unlock(map); if (map == kmem_map) splx(s); return (result); } /* * vm_map_simplify_entry: * * Simplify the given map entry by merging with either neighbor. This * routine also has the ability to merge with both neighbors. * * The map must be locked. * * This routine guarentees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or * both neighbors. */ void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev; vm_size_t prevsize, esize; if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) return; prev = entry->prev; if (prev != &map->header) { prevsize = prev->end - prev->start; if ( (prev->end == entry->start) && (prev->object.vm_object == entry->object.vm_object) && (!prev->object.vm_object || (prev->offset + prevsize == entry->offset)) && (prev->eflags == entry->eflags) && (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && (prev->wired_count == entry->wired_count)) { if (map->first_free == prev) map->first_free = entry; vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); vm_map_entry_dispose(map, prev); } } next = entry->next; if (next != &map->header) { esize = entry->end - entry->start; if ((entry->end == next->start) && (next->object.vm_object == entry->object.vm_object) && (!entry->object.vm_object || (entry->offset + esize == next->offset)) && (next->eflags == entry->eflags) && (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && (next->wired_count == entry->wired_count)) { if (map->first_free == next) map->first_free = entry; vm_map_entry_unlink(map, next); entry->end = next->end; if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); vm_map_entry_dispose(map, next); } } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) { vm_map_entry_t new_entry; /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. 
This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; vm_map_entry_link(map, entry->prev, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if ((endaddr) < (entry->end)) \ _vm_map_clip_end((map), (entry), (endaddr)); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) { vm_map_entry_t new_entry; /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); vm_map_entry_link(map, entry, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] 
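/*
 * [Illustrative aside, not part of this revision.]  The clip helpers above
 * (_vm_map_clip_start/_vm_map_clip_end) split one entry into two at a
 * boundary address while keeping the backing-object offsets consistent: the
 * piece that keeps the original start keeps the original offset, and the
 * other piece's offset is advanced by its distance from the original start.
 * A standalone sketch on a simplified entry type.
 */
#include <assert.h>
#include <stddef.h>

struct entry {
	size_t start, end;	/* [start, end) of the mapping */
	size_t offset;		/* offset of 'start' within the backing object */
};

/* Split 'e' at 'addr'; 'tail' receives the upper piece, as clip_end does. */
static void
clip_end(struct entry *e, size_t addr, struct entry *tail)
{
	assert(e->start < addr && addr < e->end);
	*tail = *e;
	tail->start = addr;
	tail->offset += addr - e->start;	/* keep the object offset in step */
	e->end = addr;
}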
*/ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * The maximum number of pages to map */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload the mappings for the given object into the specified * map. This eliminates the soft faults on process startup and * immediately after an mmap(2). */ void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; if (object == NULL) return; mtx_lock(&Giant); VM_OBJECT_LOCK(object); if (object->type == OBJT_DEVICE) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); goto unlock_return; } psize = atop(size); if (object->type != OBJT_VNODE || ((flags & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { goto unlock_return; } if (psize + pindex > object->size) { if (object->size < pindex) goto unlock_return; psize = object->size - pindex; } mpte = NULL; if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < pindex) { p = vm_page_splay(pindex, object->root); if ((object->root = p)->pindex < pindex) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if ((flags & MAP_PREFAULT_MADVISE) && cnt.v_free_count < cnt.v_free_reserved) { break; } vm_page_lock_queues(); if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); mpte = pmap_enter_quick(map->pmap, addr + ptoa(tmpidx), p, mpte); VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); } unlock_return: VM_OBJECT_UNLOCK(object); mtx_unlock(&Giant); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else { entry = entry->next; } /* * Make a first pass to check for protection violations. 
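 *
 * (Background, for illustration only: a user-level mprotect(2) call such
 *  as mprotect(addr, len, PROT_READ) is expected to arrive here with
 *  set_max == FALSE, so only the current protection is changed.)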
*/ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; vm_map_clip_end(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * Update physical map if necessary. Worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { mtx_lock(&Giant); vm_page_lock_queues(); #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK vm_page_unlock_queues(); mtx_unlock(&Giant); } vm_map_simplify_entry(map, current); current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t current, entry; int modify_map = 0; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: modify_map = 1; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: vm_map_lock_read(map); break; default: return (KERN_INVALID_ARGUMENT); } /* * Locate starting entry and clip if necessary. */ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = entry->next; } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_simplify_entry(map, current); } vm_map_unlock(map); } else { vm_pindex_t pindex; int count; /* * madvise behaviors that are implemented in the underlying * vm_object. 
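 *
 * (Illustration, not part of this change: a user-level
 *  madvise(addr, len, MADV_WILLNEED) request takes this read-locked
 *  branch; besides vm_object_madvise(), it triggers the
 *  vm_map_pmap_enter() prefault further below.)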
* * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { vm_offset_t useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; pindex = OFF_TO_IDX(current->offset); count = atop(current->end - current->start); useStart = current->start; if (current->start < start) { pindex += atop(start - current->start); count -= atop(start - current->start); useStart = start; } if (current->end > end) count -= atop(current->end - end); if (count <= 0) continue; vm_object_madvise(current->object.vm_object, pindex, count, behav); if (behav == MADV_WILLNEED) { vm_map_pmap_enter(map, useStart, current->object.vm_object, pindex, (count << PAGE_SHIFT), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. */ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_unwire; user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, user_unwire)) { /* * Allow interruption of user unwiring? */ } vm_map_lock(map); if (last_timestamp+1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * First_entry has been deleted. 
*/ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ entry->eflags |= MAP_ENTRY_IN_TRANSITION; /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } /* * Require that the entry is wired. */ if (entry->wired_count == 0 || (user_unwire && (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)) { end = entry->end; rv = KERN_INVALID_ARGUMENT; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_unwire: lookup failed")); } entry = first_entry; while (entry != &map->header && entry->start < end) { if (rv == KERN_SUCCESS) { if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; entry->wired_count--; if (entry->wired_count == 0) { /* * Retain the map lock. */ vm_fault_unwire(map, entry->start, entry->end); } } KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("vm_map_unwire: in-transition flag missing")); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_wire: * * Implements both kernel and user wiring. */ int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_end, saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_wire; user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, user_wire)) { /* * Allow interruption of user wiring? */ } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * first_entry has been deleted. 
*/ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) */ entry->eflags |= MAP_ENTRY_IN_TRANSITION; /* * */ if (entry->wired_count == 0) { entry->wired_count++; saved_start = entry->start; saved_end = entry->end; /* * Release the map lock, relying on the in-transition * mark. */ vm_map_unlock(map); rv = vm_fault_wire(map, saved_start, saved_end, user_wire); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ result = vm_map_lookup_entry(map, saved_start, &tmp_entry); KASSERT(result, ("vm_map_wire: lookup failed")); if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; while (entry->end < saved_end) { if (rv != KERN_SUCCESS) { KASSERT(entry->wired_count == 1, ("vm_map_wire: bad count")); entry->wired_count = -1; } entry = entry->next; } } last_timestamp = map->timestamp; if (rv != KERN_SUCCESS) { KASSERT(entry->wired_count == 1, ("vm_map_wire: bad count")); /* * Assign an out-of-range value to represent * the failure to wire this entry. */ entry->wired_count = -1; end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_wire: lookup failed")); } entry = first_entry; while (entry != &map->header && entry->start < end) { if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else { if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) entry->wired_count--; if (entry->wired_count == 0) { /* * Retain the map lock. */ vm_fault_unwire(map, entry->start, entry->end); } } KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("vm_map_wire: in-transition flag missing")); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. 
Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = entry->start; end = entry->end; } /* * Make a first pass to check for user-wired memory and holes. */ for (current = entry; current->start < end; current = current->next) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) { mtx_lock(&Giant); vm_page_lock_queues(); pmap_remove(map->pmap, start, end); vm_page_unlock_queues(); mtx_unlock(&Giant); } /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } vm_object_sync(object, offset, size, syncio, invalidate); start += size; } vm_map_unlock_read(map); return (KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { vm_fault_unwire(map, entry->start, entry->end); entry->wired_count = 0; } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. 
*/ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, count; vm_map_entry_unlink(map, entry); map->size -= entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = entry->object.vm_object) != NULL) { count = OFF_TO_IDX(entry->end - entry->start); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_LOCK(object); if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || object == kernel_object || object == kmem_object) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_collapse(object); vm_object_page_remove(object, offidxstart, offidxend, FALSE); if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && offidxstart < object->size) object->size = offidxstart; } VM_OBJECT_UNLOCK(object); vm_object_deallocate(object); } vm_map_entry_dispose(map, entry); } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. */ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry; vm_map_entry_t first_entry; /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); } /* * Save the free space hint */ if (entry == &map->header) { map->first_free = &map->header; } else if (map->first_free->start >= start) { map->first_free = entry->prev; } /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; /* * Wait for wiring or unwiring of an entry to complete. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) { unsigned int last_timestamp; vm_offset_t saved_start; vm_map_entry_t tmp_entry; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, FALSE); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) entry = tmp_entry->next; else { entry = tmp_entry; vm_map_clip_start(map, entry, saved_start); } } continue; } vm_map_clip_end(map, entry, end); next = entry->next; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) { vm_map_entry_unwire(map, entry); } if (!map->system_map) mtx_lock(&Giant); vm_page_lock_queues(); pmap_remove(map->pmap, entry->start, entry->end); vm_page_unlock_queues(); if (!map->system_map) mtx_unlock(&Giant); /* * Delete the entry (which may delete the object) only after * removing all pmap entries pointing to its pages. * (Otherwise, its page frames may be reallocated, and any * modify bits will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. 
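 *
 *	A minimal caller sketch (illustrative; "addr" and "size" are
 *	assumed to describe an existing kernel-map allocation):
 *
 *		(void)vm_map_remove(kernel_map, trunc_page(addr),
 *		    round_page(addr + size));
 *
 *	which is essentially what kmem_free() does.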
*/ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result, s = 0; if (map == kmem_map) s = splvm(); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); if (map == kmem_map) splx(s); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { if (entry == &map->header) return (FALSE); /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry) { vm_object_t src_object; if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { vm_page_lock_queues(); pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); vm_page_unlock_queues(); } /* * Make a copy of the object. */ if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_LOCK(src_object); if ((src_object->handle == NULL) && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { vm_object_collapse(src_object); if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(src_object); dst_entry->object.vm_object = src_object; src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->offset = src_entry->offset; } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * Of course, wired down pages can't be set copy-on-write. * Cause wired pages to be copied into the new map by * simulating faults (the new pages are pageable) */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * The source map must not be locked. 
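 *
 *	Sketch of the expected use from the fork path (illustrative only;
 *	p1/p2 stand for the parent and child process):
 *
 *		p2->p_vmspace = vmspace_fork(p1->p_vmspace);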
*/ struct vmspace * vmspace_fork(struct vmspace *vm1) { struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_entry_t new_entry; vm_object_t object; GIANT_REQUIRED; vm_map_lock(old_map); old_map->infork = 1; vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset); bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, (caddr_t) &vm1->vm_endcopy - (caddr_t) &vm1->vm_startcopy); new_map = &vm2->vm_map; /* XXX */ new_map->timestamp = 1; /* Do not inherit the MAP_WIREFUTURE property. */ if ((new_map->flags & MAP_WIREFUTURE) == MAP_WIREFUTURE) new_map->flags &= ~MAP_WIREFUTURE; old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. */ object = old_entry->object.vm_object; if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = (vm_offset_t) 0; } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, atop(old_entry->end - old_entry->start)); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; /* Transfer the second reference too. */ vm_object_reference( old_entry->object.vm_object); vm_object_deallocate(object); object = old_entry->object.vm_object; } VM_OBJECT_LOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(object); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry); break; } old_entry = old_entry->next; } new_map->size = old_map->size; old_map->infork = 0; vm_map_unlock(old_map); return (vm2); } int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, top; vm_size_t init_ssize; int orient, rv; rlim_t vmemlim; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. * NOTE: We explicitly allow bi-directional stacks. */ orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP); cow &= ~orient; KASSERT(orient != 0, ("No stack grow direction")); if (addrbos < vm_map_min(map) || addrbos > map->max_offset) return (KERN_NO_SPACE); init_ssize = (max_ssize < sgrowsiz) ? 
max_ssize : sgrowsiz; PROC_LOCK(curthread->td_proc); vmemlim = lim_cur(curthread->td_proc, RLIMIT_VMEM); PROC_UNLOCK(curthread->td_proc); vm_map_lock(map); /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* * If we can't accomodate max_ssize in the current mapping, no go. * However, we need to be aware that subsequent user mappings might * map into the space we have reserved for stack, and currently this * space is not protected. * * Hopefully we will at least detect this condition when we try to * grow the stack. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < addrbos + max_ssize)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) bot = addrbos + max_ssize - init_ssize; else if (orient == MAP_STACK_GROWS_UP) bot = addrbos; else bot = round_page(addrbos + max_ssize/2 - init_ssize/2); top = bot + init_ssize; rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); /* Now set the avail_ssize amount. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, bot); new_entry = prev_entry->next; if (new_entry->end != top || new_entry->start != bot) panic("Bad entry start/end for new stack entry"); new_entry->avail_ssize = max_ssize - init_ssize; if (orient & MAP_STACK_GROWS_DOWN) new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; if (orient & MAP_STACK_GROWS_UP) new_entry->eflags |= MAP_ENTRY_GROWS_UP; } vm_map_unlock(map); return (rv); } /* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the * desired address is already mapped, or if we successfully grow * the stack. Also returns KERN_SUCCESS if addr is outside the * stack range (this is strange, but preserves compatibility with * the grow function in vm_machdep.c). */ int vm_map_growstack(struct proc *p, vm_offset_t addr) { vm_map_entry_t next_entry, prev_entry; vm_map_entry_t new_entry, stack_entry; struct vmspace *vm = p->p_vmspace; vm_map_t map = &vm->vm_map; vm_offset_t end; size_t grow_amount, max_grow; rlim_t stacklim, vmemlim; int is_procstack, rv; Retry: PROC_LOCK(p); stacklim = lim_cur(p, RLIMIT_STACK); vmemlim = lim_cur(p, RLIMIT_VMEM); PROC_UNLOCK(p); vm_map_lock_read(map); /* If addr is already in the entry range, no need to grow.*/ if (vm_map_lookup_entry(map, addr, &prev_entry)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } next_entry = prev_entry->next; if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) { /* * This entry does not grow upwards. Since the address lies * beyond this entry, the next entry (if one exists) has to * be a downward growable entry. The entry list header is * never a growable entry, so it suffices to check the flags. */ if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } stack_entry = next_entry; } else { /* * This entry grows upward. 
If the next entry does not at * least grow downwards, this is the entry we need to grow. * otherwise we have two possible choices and we have to * select one. */ if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) { /* * We have two choices; grow the entry closest to * the address to minimize the amount of growth. */ if (addr - prev_entry->end <= next_entry->start - addr) stack_entry = prev_entry; else stack_entry = next_entry; } else stack_entry = prev_entry; } if (stack_entry == next_entry) { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo")); KASSERT(addr < stack_entry->start, ("foo")); end = (prev_entry != &map->header) ? prev_entry->end : stack_entry->start - stack_entry->avail_ssize; grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE); max_grow = stack_entry->start - end; } else { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo")); KASSERT(addr >= stack_entry->end, ("foo")); end = (next_entry != &map->header) ? next_entry->start : stack_entry->end + stack_entry->avail_ssize; grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE); max_grow = end - stack_entry->end; } if (grow_amount > stack_entry->avail_ssize) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* * If there is no longer enough space between the entries nogo, and * adjust the available space. Note: this should only happen if the * user has mapped into the stack area after the stack was created, * and is probably an error. * * This also effectively destroys any guard page the user might have * intended by limiting the stack size. */ if (grow_amount > max_grow) { if (vm_map_lock_upgrade(map)) goto Retry; stack_entry->avail_ssize = max_grow; vm_map_unlock(map); return (KERN_NO_SPACE); } is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0; /* * If this is the main process stack, see if we're over the stack * limit. */ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* Round up the grow amount modulo SGROWSIZ */ grow_amount = roundup (grow_amount, sgrowsiz); if (grow_amount > stack_entry->avail_ssize) grow_amount = stack_entry->avail_ssize; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = stacklim - ctob(vm->vm_ssize); } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } if (vm_map_lock_upgrade(map)) goto Retry; if (stack_entry == next_entry) { /* * Growing downward. */ /* Get the preliminary new entry start value */ addr = stack_entry->start - grow_amount; /* * If this puts us into the previous entry, cut back our * growth to the available space. Also, see the note above. */ if (addr < end) { stack_entry->avail_ssize = max_grow; addr = end; } rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, p->p_sysent->sv_stackprot, VM_PROT_ALL, 0); /* Adjust the available stack space by the amount we grew. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, addr); new_entry = prev_entry->next; KASSERT(new_entry == stack_entry->prev, ("foo")); KASSERT(new_entry->end == stack_entry->start, ("foo")); KASSERT(new_entry->start == addr, ("foo")); grow_amount = new_entry->end - new_entry->start; new_entry->avail_ssize = stack_entry->avail_ssize - grow_amount; stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN; new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; } } else { /* * Growing upward. 
*/ addr = stack_entry->end + grow_amount; /* * If this puts us into the next entry, cut back our growth * to the available space. Also, see the note above. */ if (addr > end) { stack_entry->avail_ssize = end - stack_entry->end; addr = end; } grow_amount = addr - stack_entry->end; /* Grow the underlying object if applicable. */ if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, OFF_TO_IDX(stack_entry->offset), (vm_size_t)(stack_entry->end - stack_entry->start), (vm_size_t)grow_amount)) { map->size += (addr - stack_entry->end); /* Update the current entry. */ stack_entry->end = addr; stack_entry->avail_ssize -= grow_amount; rv = KERN_SUCCESS; if (next_entry != &map->header) vm_map_clip_start(map, next_entry, addr); } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); vm_map_unlock(map); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) { vm_map_wire(map, (stack_entry == next_entry) ? addr : addr - grow_amount, (stack_entry == next_entry) ? stack_entry->start : addr, (p->p_flag & P_SYSTEM) ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); } return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ void vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; GIANT_REQUIRED; newvmspace = vmspace_alloc(minuser, maxuser); bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy, (caddr_t) &newvmspace->vm_endcopy - (caddr_t) &newvmspace->vm_startcopy); /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ p->p_vmspace = newvmspace; if (p == curthread->td_proc) /* XXXKSE ? */ pmap_activate(curthread); vmspace_free(oldvmspace); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ void vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; GIANT_REQUIRED; if (oldvmspace->vm_refcnt == 1) return; newvmspace = vmspace_fork(oldvmspace); p->p_vmspace = newvmspace; if (p == curthread->td_proc) /* XXXKSE ? */ pmap_activate(curthread); vmspace_free(oldvmspace); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. 
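 *
 *	Sketched usage, modelled on the fault path (illustrative only;
 *	"original_map" is a placeholder for the map being faulted on):
 *
 *		vm_map_t map = original_map;
 *
 *		result = vm_map_lookup(&map, vaddr, fault_type, &entry,
 *		    &object, &pindex, &prot, &wired);
 *		if (result != KERN_SUCCESS)
 *			return (result);
 *		... resolve the page via (object, pindex) ...
 *		vm_map_lookup_done(map, entry);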
*/ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; RetryLookup:; /* * Lookup the faulting address. */ vm_map_lock_read(map); #define RETURN(why) \ { \ vm_map_unlock_read(map); \ return (why); \ } /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->root; *out_entry = entry; if (entry == NULL || (vaddr < entry->start) || (vaddr >= entry->end)) { /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) RETURN(KERN_INVALID_ADDRESS); entry = *out_entry; } /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. * Note the special case for MAP_ENTRY_COW * pages with an override. This is to implement a forced * COW for debuggers. */ if (fault_type & VM_PROT_OVERRIDE_WRITE) prot = entry->max_protection; else prot = entry->protection; fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); if ((fault_type & prot) != fault_type) { RETURN(KERN_PROTECTION_FAILURE); } if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE) && (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { RETURN(KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if (fault_type & VM_PROT_WRITE) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; vm_object_shadow( &entry->object.vm_object, &entry->offset, atop(entry->end - entry->start)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->offset = 0; vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; /* * Return whether this is the only map sharing this data. */ *out_prot = prot; return (KERN_SUCCESS); #undef RETURN } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). 
*/ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } #include "opt_ddb.h" #ifdef DDB #include #include /* * vm_map_print: [ debug ] */ DB_SHOW_COMMAND(map, vm_map_print) { static int nlines; /* XXX convert args. */ vm_map_t map = (vm_map_t)addr; boolean_t full = have_addr; vm_map_entry_t entry; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); nlines++; if (!full && db_indent) return; db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { db_iprintf("map entry %p: start=%p, end=%p\n", (void *)entry, (void *)entry->start, (void *)entry->end); nlines++; { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char)entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.sub_map != entry->object.sub_map)) { db_indent += 2; vm_map_print((db_expr_t)(intptr_t) entry->object.sub_map, full, 0, (char *)0); db_indent -= 2; } } else { db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.vm_object != entry->object.vm_object)) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, full, 0, (char *)0); nlines += 4; db_indent -= 2; } } } db_indent -= 2; if (db_indent == 0) nlines = 0; } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = (struct proc *) addr; } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL); } #endif /* DDB */ Index: head/sys/vm/vm_map.h =================================================================== --- head/sys/vm/vm_map.h (revision 127960) +++ head/sys/vm/vm_map.h (revision 127961) @@ -1,358 +1,354 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_map.h 8.9 (Berkeley) 5/17/95 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory map module definitions. */ #ifndef _VM_MAP_ #define _VM_MAP_ #include #include #include /* * Types defined: * * vm_map_t the high-level address map data structure. * vm_map_entry_t an entry in an address map. */ typedef u_char vm_flags_t; typedef u_int vm_eflags_t; /* * Objects which live in maps may be either VM objects, or * another map (called a "sharing map") which denotes read-write * sharing with other maps. */ union vm_map_object { struct vm_object *vm_object; /* object object */ struct vm_map *sub_map; /* belongs to another map */ }; /* * Address map entries consist of start and end addresses, * a VM object (or sharing map) and offset into that object, * and user-exported inheritance and protection information. * Also included is control information for virtual copy operations. 
*/ struct vm_map_entry { struct vm_map_entry *prev; /* previous entry */ struct vm_map_entry *next; /* next entry */ struct vm_map_entry *left; /* left child in binary search tree */ struct vm_map_entry *right; /* right child in binary search tree */ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ vm_offset_t avail_ssize; /* amt can grow if this is a stack */ union vm_map_object object; /* object I point to */ vm_ooffset_t offset; /* offset into object */ vm_eflags_t eflags; /* map entry flags */ /* Only in task maps: */ vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ vm_inherit_t inheritance; /* inheritance */ int wired_count; /* can be paged if = 0 */ vm_pindex_t lastr; /* last read */ }; #define MAP_ENTRY_NOSYNC 0x0001 #define MAP_ENTRY_IS_SUB_MAP 0x0002 #define MAP_ENTRY_COW 0x0004 #define MAP_ENTRY_NEEDS_COPY 0x0008 #define MAP_ENTRY_NOFAULT 0x0010 #define MAP_ENTRY_USER_WIRED 0x0020 #define MAP_ENTRY_BEHAV_NORMAL 0x0000 /* default behavior */ #define MAP_ENTRY_BEHAV_SEQUENTIAL 0x0040 /* expect sequential access */ #define MAP_ENTRY_BEHAV_RANDOM 0x0080 /* expect random access */ #define MAP_ENTRY_BEHAV_RESERVED 0x00C0 /* future use */ #define MAP_ENTRY_BEHAV_MASK 0x00C0 #define MAP_ENTRY_IN_TRANSITION 0x0100 /* entry being changed */ #define MAP_ENTRY_NEEDS_WAKEUP 0x0200 /* waiters in transition */ #define MAP_ENTRY_NOCOREDUMP 0x0400 /* don't include in a core */ #define MAP_ENTRY_GROWS_DOWN 0x1000 /* Top-down stacks */ #define MAP_ENTRY_GROWS_UP 0x2000 /* Bottom-up stacks */ #ifdef _KERNEL static __inline u_char vm_map_entry_behavior(vm_map_entry_t entry) { return (entry->eflags & MAP_ENTRY_BEHAV_MASK); } #endif /* _KERNEL */ /* * A map is a set of map entries. These map entries are * organized both as a binary search tree and as a doubly-linked * list. Both structures are ordered based upon the start and * end addresses contained within each map entry. Sleator and * Tarjan's top-down splay algorithm is employed to control * height imbalance in the binary search tree. * * Note: the lock structure cannot be the first element of vm_map * because this can result in a running lockup between two or more * system processes trying to kmem_alloc_wait() due to kmem_alloc_wait() * and free tsleep/waking up 'map' and the underlying lockmgr also * sleeping and waking up on 'map'. The lockup occurs when the map fills * up. The 'exec' map, for example. * * List of locks * (c) const until freed */ struct vm_map { struct vm_map_entry header; /* List of entries */ struct lock lock; /* Lock for map data */ struct mtx system_mtx; int nentries; /* Number of entries */ vm_size_t size; /* virtual size */ u_int timestamp; /* Version number */ u_char needs_wakeup; u_char system_map; /* Am I a system map? */ u_char infork; /* Am I in fork processing? 
*/ vm_flags_t flags; /* flags for this vm_map */ vm_map_entry_t root; /* Root of a binary search tree */ vm_map_entry_t first_free; /* First free space hint */ pmap_t pmap; /* (c) Physical map */ #define min_offset header.start /* (c) */ #define max_offset header.end /* (c) */ }; /* * vm_flags_t values */ #define MAP_WIREFUTURE 0x01 /* wire all future pages */ #ifdef _KERNEL static __inline vm_offset_t vm_map_max(vm_map_t map) { return (map->max_offset); } static __inline vm_offset_t vm_map_min(vm_map_t map) { return (map->min_offset); } static __inline pmap_t vm_map_pmap(vm_map_t map) { return (map->pmap); } static __inline void vm_map_modflags(vm_map_t map, vm_flags_t set, vm_flags_t clear) { map->flags = (map->flags | set) & ~clear; } #endif /* _KERNEL */ /* * Shareable process virtual address space. * * List of locks * (c) const until freed */ struct vmspace { struct vm_map vm_map; /* VM address map */ struct pmap vm_pmap; /* private physical map */ int vm_refcnt; /* number of references */ struct shmmap_state *vm_shm; /* SYS5 shared memory private data XXX */ /* we copy between vm_startcopy and vm_endcopy on fork */ #define vm_startcopy vm_rssize segsz_t vm_rssize; /* current resident set size in pages */ segsz_t vm_swrss; /* resident set size before last swap */ segsz_t vm_tsize; /* text size (pages) XXX */ segsz_t vm_dsize; /* data size (pages) XXX */ segsz_t vm_ssize; /* stack size (pages) */ caddr_t vm_taddr; /* (c) user virtual address of text */ caddr_t vm_daddr; /* (c) user virtual address of data */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ #define vm_endcopy vm_exitingcnt int vm_exitingcnt; /* several processes zombied in exit1 */ }; #ifdef _KERNEL static __inline pmap_t vmspace_pmap(struct vmspace *vmspace) { return &vmspace->vm_pmap; } #endif /* _KERNEL */ #ifdef _KERNEL /* * Macros: vm_map_lock, etc. * Function: * Perform locking on the data portion of a map. Note that * these macros mimic procedure calls returning void. The * semicolon is supplied by the user of these macros, not * by the macros themselves. The macros can safely be used * as unbraced elements in a higher level statement. 
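 *
 *	Because each macro expands to a single function call, it can,
 *	for example (illustrative only), appear unbraced:
 *
 *		if (wired)
 *			vm_map_lock(map);
 *		else
 *			vm_map_lock_read(map);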
*/ void _vm_map_lock(vm_map_t map, const char *file, int line); void _vm_map_unlock(vm_map_t map, const char *file, int line); void _vm_map_lock_read(vm_map_t map, const char *file, int line); void _vm_map_unlock_read(vm_map_t map, const char *file, int line); int _vm_map_trylock(vm_map_t map, const char *file, int line); int _vm_map_trylock_read(vm_map_t map, const char *file, int line); int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line); void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line); int vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait); void vm_map_wakeup(vm_map_t map); #define vm_map_lock(map) _vm_map_lock(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock(map) _vm_map_unlock(map, LOCK_FILE, LOCK_LINE) #define vm_map_lock_read(map) _vm_map_lock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_unlock_read(map) _vm_map_unlock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_trylock(map) _vm_map_trylock(map, LOCK_FILE, LOCK_LINE) #define vm_map_trylock_read(map) \ _vm_map_trylock_read(map, LOCK_FILE, LOCK_LINE) #define vm_map_lock_upgrade(map) \ _vm_map_lock_upgrade(map, LOCK_FILE, LOCK_LINE) #define vm_map_lock_downgrade(map) \ _vm_map_lock_downgrade(map, LOCK_FILE, LOCK_LINE) long vmspace_resident_count(struct vmspace *vmspace); long vmspace_wired_count(struct vmspace *vmspace); #endif /* _KERNEL */ /* XXX: number of kernel maps and entries to statically allocate */ #define MAX_KMAP 10 #define MAX_KMAPENT 128 #define MAX_MAPENT 128 /* * Copy-on-write flags for vm_map operations */ #define MAP_UNUSED_01 0x0001 #define MAP_COPY_ON_WRITE 0x0002 #define MAP_NOFAULT 0x0004 #define MAP_PREFAULT 0x0008 #define MAP_PREFAULT_PARTIAL 0x0010 #define MAP_DISABLE_SYNCER 0x0020 #define MAP_DISABLE_COREDUMP 0x0100 #define MAP_PREFAULT_MADVISE 0x0200 /* from (user) madvise request */ #define MAP_STACK_GROWS_DOWN 0x1000 #define MAP_STACK_GROWS_UP 0x2000 /* * vm_fault option flags */ #define VM_FAULT_NORMAL 0 /* Nothing special */ #define VM_FAULT_CHANGE_WIRING 1 /* Change the wiring as appropriate */ #define VM_FAULT_USER_WIRE 2 /* Likewise, but for user purposes */ #define VM_FAULT_WIRE_MASK (VM_FAULT_CHANGE_WIRING|VM_FAULT_USER_WIRE) #define VM_FAULT_DIRTY 8 /* Dirty the page */ /* * vm_map_wire and vm_map_unwire option flags */ #define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ #define VM_MAP_WIRE_USER 1 /* wiring in a user map */ #define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ #define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ #ifdef _KERNEL boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t); vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t); int vm_map_delete (vm_map_t, vm_offset_t, vm_offset_t); int vm_map_find (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t, vm_prot_t, vm_prot_t, int); int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *); int vm_map_inherit (vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t); void vm_map_init (struct vm_map *, vm_offset_t, vm_offset_t); int vm_map_insert (vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int); int vm_map_lookup (vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *); void vm_map_lookup_done (vm_map_t, vm_map_entry_t); boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *); void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int 
flags); int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t); int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t); void vm_map_startup (void); int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t); int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int); void vm_map_simplify_entry (vm_map_t, vm_map_entry_t); void vm_init2 (void); int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); int vm_map_growstack (struct proc *p, vm_offset_t addr); int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); int vmspace_swap_count (struct vmspace *vmspace); #endif /* _KERNEL */ #endif /* _VM_MAP_ */ Index: head/sys/vm/vm_meter.c =================================================================== --- head/sys/vm/vm_meter.c (revision 127960) +++ head/sys/vm/vm_meter.c (revision 127961) @@ -1,368 +1,364 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vmmeter cnt; int maxslp = MAXSLP; SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, CTLFLAG_RW, &cnt.v_free_min, 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, CTLFLAG_RW, &cnt.v_free_target, 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, CTLFLAG_RW, &cnt.v_free_reserved, 0, ""); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, CTLFLAG_RW, &cnt.v_inactive_target, 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min, CTLFLAG_RW, &cnt.v_cache_min, 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max, CTLFLAG_RW, &cnt.v_cache_max, 0, ""); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, CTLFLAG_RW, &cnt.v_pageout_free_min, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &cnt.v_free_severe, 0, ""); SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD, &averunnable, loadavg, "Machine loadaverage history"); static int vmtotal(SYSCTL_HANDLER_ARGS) { /* XXXKSE almost completely broken */ struct proc *p; struct vmtotal total, *totalp; vm_map_entry_t entry; vm_object_t object; vm_map_t map; int paging; struct thread *td; totalp = &total; bzero(totalp, sizeof *totalp); /* * Mark all objects as inactive. */ GIANT_REQUIRED; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { if (!VM_OBJECT_TRYLOCK(object)) { /* * Avoid a lock-order reversal. Consequently, * the reported number of active pages may be * greater than the actual number. */ continue; } vm_object_clear_flag(object, OBJ_ACTIVE); VM_OBJECT_UNLOCK(object); } mtx_unlock(&vm_object_list_mtx); /* * Calculate process statistics. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_flag & P_SYSTEM) continue; mtx_lock_spin(&sched_lock); switch (p->p_state) { case PRS_NEW: mtx_unlock_spin(&sched_lock); continue; break; default: FOREACH_THREAD_IN_PROC(p, td) { /* Need new statistics XXX */ switch (td->td_state) { case TDS_INHIBITED: if (TD_ON_LOCK(td) || (td->td_inhibitors == TDI_SWAPPED)) { totalp->t_sw++; } else if (TD_IS_SLEEPING(td) || TD_AWAITING_INTR(td) || TD_IS_SUSPENDED(td)) { if (td->td_priority <= PZERO) totalp->t_dw++; else totalp->t_sl++; } break; case TDS_CAN_RUN: totalp->t_sw++; break; case TDS_RUNQ: case TDS_RUNNING: totalp->t_rq++; continue; default: break; } } } mtx_unlock_spin(&sched_lock); /* * Note active objects. */ paging = 0; map = &p->p_vmspace->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || (object = entry->object.vm_object) == NULL) continue; VM_OBJECT_LOCK(object); vm_object_set_flag(object, OBJ_ACTIVE); paging |= object->paging_in_progress; VM_OBJECT_UNLOCK(object); } vm_map_unlock_read(map); if (paging) totalp->t_pw++; } sx_sunlock(&allproc_lock); /* * Calculate object memory usage statistics. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { /* * Perform unsynchronized reads on the object to avoid * a lock-order reversal. In this case, the lack of * synchronization should not impair the accuracy of * the reported statistics. */ if (object->type == OBJT_DEVICE) { /* * Devices, like /dev/mem, will badly skew our totals. 
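The vmtotal() handler (continued below) publishes these totals through the vm.vmtotal sysctl declared further down in this file. A userland sketch of consuming it; the field names are the ones the handler fills in, though the exact integer widths of struct vmtotal vary by release:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <stdio.h>

int
main(void)
{
        struct vmtotal vmt;
        size_t len = sizeof(vmt);

        if (sysctlbyname("vm.vmtotal", &vmt, &len, NULL, 0) == -1) {
                perror("sysctlbyname(vm.vmtotal)");
                return (1);
        }
        /* Field names match those filled in by the handler above. */
        printf("total VM pages:    %ld\n", (long)vmt.t_vm);
        printf("active VM pages:   %ld\n", (long)vmt.t_avm);
        printf("resident pages:    %ld\n", (long)vmt.t_rm);
        printf("free pages:        %ld\n", (long)vmt.t_free);
        return (0);
}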
*/ continue; } totalp->t_vm += object->size; totalp->t_rm += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { totalp->t_avm += object->size; totalp->t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ totalp->t_vmshr += object->size; totalp->t_rmshr += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { totalp->t_avmshr += object->size; totalp->t_armshr += object->resident_page_count; } } } mtx_unlock(&vm_object_list_mtx); totalp->t_free = cnt.v_free_count + cnt.v_cache_count; return (sysctl_handle_opaque(oidp, totalp, sizeof total, req)); } /* * vcnt() - accumulate statistics from all cpus and the global cnt * structure. * * The vmmeter structure is now per-cpu as well as global. Those * statistics which can be kept on a per-cpu basis (to avoid cache * stalls between cpus) can be moved to the per-cpu vmmeter. Remaining * statistics, such as v_free_reserved, are left in the global * structure. * * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) */ static int vcnt(SYSCTL_HANDLER_ARGS) { int error = 0; int count = *(int *)arg1; #ifdef SMP int i; int offset = (char *)arg1 - (char *)&cnt; for (i = 0; i < mp_ncpus; ++i) { struct pcpu *pcpu = pcpu_find(i); count += *(int *)((char *)&pcpu->pc_cnt + offset); } #else int offset = (char *)arg1 - (char *)&cnt; count += *(int *)((char *)PCPU_PTR(cnt) + offset); #endif error = SYSCTL_OUT(req, &count, sizeof(int)); return(error); } SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, "VM meter sys stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_swtch, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_swtch, 0, vcnt, "IU", "Context switches"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_trap, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_trap, 0, vcnt, "IU", "Traps"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_syscall, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_syscall, 0, vcnt, "IU", "Syscalls"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_intr, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_intr, 0, vcnt, "IU", "Hardware interrupts"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_soft, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_soft, 0, vcnt, "IU", "Software interrupts"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vm_faults, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_vm_faults, 0, vcnt, "IU", "VM faults"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_faults, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_cow_faults, 0, vcnt, "IU", "COW faults"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_optim, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_cow_optim, 0, vcnt, "IU", "Optimized COW faults"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_zfod, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_zfod, 0, vcnt, "IU", "Zero fill"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_ozfod, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_ozfod, 0, vcnt, "IU", "Optimized zero fill"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapin, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_swapin, 0, vcnt, "IU", "Swapin operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapout, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_swapout, 0, vcnt, "IU", "Swapout operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsin, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_swappgsin, 0, vcnt, "IU", "Swapin pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsout, 
CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_swappgsout, 0, vcnt, "IU", "Swapout pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodein, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_vnodein, 0, vcnt, "IU", "Vnodein operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodeout, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_vnodeout, 0, vcnt, "IU", "Vnodeout operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsin, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_vnodepgsin, 0, vcnt, "IU", "Vnodein pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsout, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_vnodepgsout, 0, vcnt, "IU", "Vnodeout pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_intrans, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_intrans, 0, vcnt, "IU", "In transit page blocking"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_reactivated, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_reactivated, 0, vcnt, "IU", "Reactivated pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdwakeups, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_pdwakeups, 0, vcnt, "IU", "Pagedaemon wakeups"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_pdpages, 0, vcnt, "IU", "Pagedaemon page scans"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_dfree, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_dfree, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pfree, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_pfree, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tfree, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_tfree, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_size, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_page_size, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_count, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_page_count, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_reserved, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_free_reserved, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_target, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_free_target, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_min, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_free_min, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_count, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_free_count, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_wire_count, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_wire_count, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_active_count, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_active_count, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_target, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_inactive_target, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_count, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_inactive_count, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_count, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_cache_count, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_min, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_cache_min, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_max, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_cache_max, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pageout_free_min, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_pageout_free_min, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_interrupt_free_min, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_interrupt_free_min, 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forks, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_forks, 0, vcnt, "IU", "Number of fork() calls"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforks, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_vforks, 0, vcnt, "IU", "Number of vfork() calls"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforks, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_rforks, 0, vcnt, "IU", "Number of 
rfork() calls"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreads, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_kthreads, 0, vcnt, "IU", "Number of fork() calls by kernel"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forkpages, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_forkpages, 0, vcnt, "IU", "VM pages affected by fork()"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforkpages, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_vforkpages, 0, vcnt, "IU", "VM pages affected by vfork()"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforkpages, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_rforkpages, 0, vcnt, "IU", "VM pages affected by rfork()"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreadpages, CTLTYPE_UINT|CTLFLAG_RD, &cnt.v_kthreadpages, 0, vcnt, "IU", "VM pages affected by fork() by kernel"); SYSCTL_INT(_vm_stats_misc, OID_AUTO, zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, ""); #if 0 SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_mask, CTLFLAG_RD, &page_mask, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_shift, CTLFLAG_RD, &page_shift, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, first_page, CTLFLAG_RD, &first_page, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, last_page, CTLFLAG_RD, &last_page, 0, ""); #endif Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 127960) +++ head/sys/vm/vm_mmap.c (revision 127961) @@ -1,1323 +1,1319 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif static int max_proc_mmap; SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); /* * Set the maximum number of vm_map_entry structures per process. Roughly * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 * of our KVM malloc space still results in generous limits. We want a * default that is good enough to prevent the kernel running out of resources * if attacked from compromised user account but generous enough such that * multi-threaded processes are not unduly inconvenienced. */ static void vmmapentry_rsrc_init(void *); SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) static void vmmapentry_rsrc_init(dummy) void *dummy; { max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); max_proc_mmap /= 100; } static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct vnode *, vm_ooffset_t, vm_object_t *); /* * MPSAFE */ /* ARGSUSED */ int sbrk(td, uap) struct thread *td; struct sbrk_args *uap; { /* Not yet implemented */ /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sstk(td, uap) struct thread *td; struct sstk_args *uap; { /* Not yet implemented */ /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(td, uap) struct thread *td; struct getpagesize_args *uap; { /* MP SAFE */ td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. * * Block devices can be mmap'd no matter what they represent. Cache coherency * is maintained as long as you do not write directly to the underlying * character device. 
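The alignment rules this comment describes reduce to a little page arithmetic performed in mmap() just below: split the file offset into an aligned part and a page offset, grow the requested length by that offset, and round it up to a whole page. A standalone model of that arithmetic, using getpagesize() in place of PAGE_MASK/round_page() and made-up example values:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        const unsigned long pagesz = (unsigned long)getpagesize();
        const unsigned long pagemask = pagesz - 1;
        unsigned long pos = 3000;       /* example: unaligned file offset */
        unsigned long size = 10000;     /* example: requested length */
        unsigned long pageoff;

        pageoff = pos & pagemask;       /* offset within its page          */
        pos -= pageoff;                 /* align the file offset down      */
        size += pageoff;                /* cover the bytes in front...     */
        size = (size + pagemask) & ~pagemask;   /* ...and round up a page  */

        printf("aligned pos=%lu, pageoff=%lu, mapped size=%lu\n",
            pos, pageoff, size);
        return (0);
}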
*/ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif /* * MPSAFE */ int mmap(td, uap) struct thread *td; struct mmap_args *uap; { struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot, maxprot; void *handle; int flags, error; off_t pos; struct vmspace *vms = td->td_proc->p_vmspace; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; pos = uap->pos; fp = NULL; /* make sure mapping fits into numeric range etc */ if ((ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); if (flags & MAP_STACK) { if ((uap->fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (addr < vm_map_min(&vms->vm_map) || addr + size > vm_map_max(&vms->vm_map)) return (EINVAL); if (addr + size < addr) return (EINVAL); } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ PROC_LOCK(td->td_proc); if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)); PROC_UNLOCK(td->td_proc); } if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; pos = 0; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. * don't let the descriptor disappear on us if we block */ if ((error = fget(td, uap->fd, &fp)) != 0) goto done; if (fp->f_type != DTYPE_VNODE) { error = EINVAL; goto done; } /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if (fp->f_flag & FPOSIXSHM) flags |= MAP_NOSYNC; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? 
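As the FPOSIXSHM note above says, a descriptor created by shm_open(3) gets MAP_NOSYNC implied when it is mapped, trading on-disk coherence for speed. A userland sketch of that path (the object name is arbitrary):

#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        const char *name = "/demo_posixshm";    /* hypothetical object name */
        size_t len = (size_t)getpagesize();
        char *p;
        int fd;

        fd = shm_open(name, O_CREAT | O_RDWR, 0600);
        if (fd == -1) {
                perror("shm_open");
                return (1);
        }
        if (ftruncate(fd, (off_t)len) == -1) {
                perror("ftruncate");
                return (1);
        }
        /* The kernel sees FPOSIXSHM on fd and adds MAP_NOSYNC for us. */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return (1);
        }
        strcpy(p, "hello from POSIX shm");
        printf("%s\n", p);
        munmap(p, len);
        close(fd);
        shm_unlink(name);
        return (0);
}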
*/ if (vp->v_mount->mnt_flag & MNT_NOEXEC) maxprot = VM_PROT_NONE; else maxprot = VM_PROT_EXECUTE; if (fp->f_flag & FREAD) { maxprot |= VM_PROT_READ; } else if (prot & PROT_READ) { error = EACCES; goto done; } /* * If we are sharing potential changes (either via * MAP_SHARED or via the implicit sharing of character * device mappings), and we are trying to get write * permission although we opened it without asking * for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) { maxprot |= VM_PROT_WRITE; } else if ((prot & PROT_WRITE) != 0) { error = EACCES; goto done; } } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { maxprot |= VM_PROT_WRITE; } handle = (void *)vp; } /* * Do not allow more then a certain number of vm_map_entry structures * per process. Scale with the number of rforks sharing the map * to make the limit reasonable for threads. */ if (max_proc_mmap && vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { error = ENOMEM; goto done; } error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, flags, handle, pos); if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: if (fp) fdrop(fp, td); return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(td, uap) struct thread *td; struct ommap_args *uap; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(td, &nargs)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; int len; int flags; }; #endif /* * MPSAFE */ int msync(td, uap) struct thread *td; struct msync_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. */ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_INVALID_ARGUMENT: return (EBUSY); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif /* * MPSAFE */ int munmap(td, uap) struct thread *td; struct munmap_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_map_t map; addr = (vm_offset_t) uap->addr; size = uap->len; if (size == 0) return (EINVAL); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); /* * Check for illegal addresses. 
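Stepping back to the protection checks in mmap() above: a MAP_SHARED mapping may only gain VM_PROT_WRITE if the descriptor was opened for writing, otherwise a PROT_WRITE request fails with EACCES, while a MAP_PRIVATE mapping of a regular file may be writable regardless (writes land in copy-on-write pages). A userland sketch of both cases:

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        size_t len = (size_t)getpagesize();
        void *p;
        int fd;

        fd = open("/etc/passwd", O_RDONLY);     /* any readable file works */
        if (fd == -1) {
                perror("open");
                return (1);
        }
        /* Shared writable mapping of a read-only descriptor: EACCES. */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                perror("mmap(PROT_WRITE, MAP_SHARED) on O_RDONLY fd");

        /* Private writable mapping: allowed, writes never reach the file. */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                perror("mmap(MAP_PRIVATE)");
        else
                munmap(p, len);
        close(fd);
        return (0);
}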
Watch out for address wrap... */ map = &td->td_proc->p_vmspace->vm_map; if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) return (EINVAL); vm_map_lock(map); /* * Make sure entire range is allocated. */ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) { vm_map_unlock(map); return (EINVAL); } /* returns nothing but KERN_SUCCESS anyway */ vm_map_delete(map, addr, addr + size); vm_map_unlock(map); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif /* * MPSAFE */ int mprotect(td, uap) struct thread *td; struct mprotect_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif /* * MPSAFE */ int minherit(td, uap) struct thread *td; struct minherit_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif /* * MPSAFE */ /* ARGSUSED */ int madvise(td, uap) struct thread *td; struct madvise_args *uap; { vm_offset_t start, end; vm_map_t map; struct proc *p; int error; /* * Check for our special case, advising the swap pager we are * "immortal." */ if (uap->behav == MADV_PROTECT) { error = suser(td); if (error == 0) { p = td->td_proc; PROC_LOCK(p); p->p_flag |= P_PROTECTED; PROC_UNLOCK(p); } return (error); } /* * Check for illegal behavior */ if (uap->behav < 0 || uap->behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; if ((vm_offset_t)uap->addr < vm_map_min(map) || (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) return (EINVAL); if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page((vm_offset_t) uap->addr); end = round_page((vm_offset_t) uap->addr + uap->len); if (vm_map_madvise(map, start, end, uap->behav)) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif /* * MPSAFE */ /* ARGSUSED */ int mincore(td, uap) struct thread *td; struct mincore_args *uap; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. 
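madvise() above only validates the range and the advice value (anything outside 0..MADV_CORE is EINVAL) and then defers to vm_map_madvise(); the call is purely advisory. A userland sketch exercising it on an anonymous mapping, before the mincore() implementation continues below:

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        size_t len = 16 * (size_t)getpagesize();
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return (1);
        }
        memset(p, 0xa5, len);           /* touch the pages */

        /* Advisory only: tell the VM system we will not need them soon. */
        if (madvise(p, len, MADV_DONTNEED) == -1)
                perror("madvise");

        /* An out-of-range advice value is rejected with EINVAL. */
        if (madvise(p, len, 1000) == -1)
                perror("madvise(bogus advice)");

        munmap(p, len);
        return (0);
}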
*/ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (EINVAL); /* * Address of byte vector */ vec = uap->vec; pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) entry = entry->next; /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ mtx_lock(&Giant); mincoreinfo = pmap_mincore(pmap, addr); mtx_unlock(&Giant); if (!mincoreinfo) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); VM_OBJECT_LOCK(current->object.vm_object); m = vm_page_lookup(current->object.vm_object, pindex); /* * if the page is resident, then gather information about * it. */ if (m != NULL && m->valid != 0) { mincoreinfo = MINCORE_INCORE; vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { vm_page_flag_set(m, PG_REFERENCED); mincoreinfo |= MINCORE_REFERENCED_OTHER; } vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(current->object.vm_object); } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while ((lastvecindex + 1) < vecindex) { error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. 
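The scan above fills the caller's byte vector one page at a time via subyte(), dropping the map lock around each store and restarting if the map's timestamp changes. Seen from userland, the interface is just mincore(2); a minimal sketch:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
        size_t npages = 4;
        size_t len = npages * (size_t)getpagesize();
        char *p, *vec;
        size_t i;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return (1);
        }
        p[0] = 1;                       /* fault in only the first page */

        vec = malloc(npages);
        if (vec == NULL)
                return (1);
        if (mincore(p, len, vec) == -1) {
                perror("mincore");
                return (1);
        }
        for (i = 0; i < npages; i++)
                printf("page %zu: %s\n", i,
                    (vec[i] & MINCORE_INCORE) ? "resident" : "not resident");
        free(vec);
        munmap(p, len);
        return (0);
}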
*/ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int mlock(td, uap) struct thread *td; struct mlock_args *uap; { struct proc *proc; vm_offset_t addr, end, last, start; vm_size_t npages, size; int error; error = suser(td); if (error) return (error); addr = (vm_offset_t)uap->addr; size = uap->len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); proc = td->td_proc; PROC_LOCK(proc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > lim_cur(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); if (npages + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int mlockall(td, uap) struct thread *td; struct mlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = 0; if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); #if 0 /* * If wiring all pages in the process would cause it to exceed * a hard resource limit, return ENOMEM. */ PROC_LOCK(td->td_proc); if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) > lim_cur(td->td_proc, RLIMIT_MEMLOCK))) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); #else error = suser(td); if (error) return (error); #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); error = (error == KERN_SUCCESS ? 0 : EAGAIN); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif /* * MPSAFE */ int munlockall(td, uap) struct thread *td; struct munlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = suser(td); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. */ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int munlock(td, uap) struct thread *td; struct munlock_args *uap; { vm_offset_t addr, end, last, start; vm_size_t size; int error; error = suser(td); if (error) return (error); addr = (vm_offset_t)uap->addr; size = uap->len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * MPSAFE * * Helper function for vm_mmap. 
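mlock() above checks superuser privilege, the per-process RLIMIT_MEMLOCK limit, and the global vm_page_max_wired cap before wiring, surfacing failures as ENOMEM or EAGAIN. A userland sketch wiring and unwiring one small region:

#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        size_t len = 4 * (size_t)getpagesize();
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return (1);
        }
        /*
         * Wire the pages.  On this vintage of FreeBSD this requires
         * superuser privilege and is bounded by RLIMIT_MEMLOCK and
         * vm_page_max_wired, as checked in the syscall above.
         */
        if (mlock(p, len) == -1) {
                perror("mlock");
        } else {
                printf("wired %zu bytes\n", len);
                if (munlock(p, len) == -1)
                        perror("munlock");
        }
        munmap(p, len);
        return (0);
}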
Perform sanity check specific for mmap * operations on vnodes. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp) { struct vattr va; void *handle; vm_object_t obj; int disablexworkaround, error, flags, type; mtx_lock(&Giant); if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) { mtx_unlock(&Giant); return (error); } flags = *flagsp; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (VOP_GETVOBJECT(vp, &obj) != 0) { error = EINVAL; goto done; } if (obj->handle != vp) { vput(vp); vp = (struct vnode*)obj->handle; vget(vp, LK_EXCLUSIVE, td); } type = OBJT_VNODE; handle = vp; } else if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = vp->v_rdev; if(vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) { *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; error = 0; goto done; } /* * cdevs does not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & PROT_WRITE) != 0) { error = EACCES; goto done; } /* * However, for XIG X server to continue to work, * we should allow the superuser to do it anyway. * We only allow it at securelevel < 1. * (Because the XIG X server writes directly to video * memory via /dev/mem, it should never work at any * other securelevel. * XXX this will have to go */ if (securelevel_ge(td->td_ucred, 1)) disablexworkaround = 1; else disablexworkaround = suser(td); if (disablexworkaround && (flags & (MAP_PRIVATE|MAP_COPY))) { error = EINVAL; goto done; } /* * Force device mappings to be shared. */ flags &= ~(MAP_PRIVATE|MAP_COPY); flags |= MAP_SHARED; } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) { goto done; } if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { if (prot & PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } #ifdef MAC error = mac_check_vnode_mmap(td->td_ucred, vp, prot); if (error != 0) goto done; #endif } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. */ if (vp->v_type == VREG) { objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; } obj = vm_pager_allocate(type, handle, objsize, prot, foff); if (obj == NULL) { error = (type == OBJT_DEVICE ? EINVAL : ENOMEM); goto done; } *objp = obj; *flagsp = flags; done: vput(vp); mtx_unlock(&Giant); return (error); } /* * vm_mmap() * * MPSAFE * * Internal version of mmap. Currently used by mmap, exec, and sys5 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff) { boolean_t fitit; vm_object_t object; int rv = KERN_SUCCESS; vm_ooffset_t objsize; int docow, error; struct thread *td = curthread; if (size == 0) return (0); objsize = size = round_page(size); PROC_LOCK(td->td_proc); if (td->td_proc->p_vmspace->vm_map.size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { PROC_UNLOCK(td->td_proc); return(ENOMEM); } PROC_UNLOCK(td->td_proc); /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. 
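vm_mmap() above refuses a mapping with ENOMEM once the total map size would pass the process's RLIMIT_VMEM. A userland sketch that simply inspects that limit:

#include <sys/types.h>
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_VMEM, &rl) == -1) {
                perror("getrlimit(RLIMIT_VMEM)");
                return (1);
        }
        /* vm_mmap() returns ENOMEM once the map size would pass this value. */
        printf("RLIMIT_VMEM soft limit: %ju bytes\n", (uintmax_t)rl.rlim_cur);
        printf("RLIMIT_VMEM hard limit: %ju bytes\n", (uintmax_t)rl.rlim_max);
        return (0);
}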
*/ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ if (handle != NULL) { error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, foff, &object); if (error) { return (error); } } if (flags & MAP_ANON) { object = NULL; docow = 0; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else { docow = MAP_PREFAULT_PARTIAL; } if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; if (maxprot & VM_PROT_READ) maxprot |= VM_PROT_EXECUTE; #endif if (fitit) *addr = pmap_addr_hint(object, *addr, size); if (flags & MAP_STACK) rv = vm_map_stack(map, *addr, size, prot, maxprot, docow | MAP_STACK_GROWS_DOWN); else rv = vm_map_find(map, object, foff, addr, size, fitit, prot, maxprot, docow); if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. */ vm_object_deallocate(object); } else if (flags & MAP_SHARED) { /* * Shared memory is also shared with children. */ rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) (void) vm_map_remove(map, *addr, *addr + size); } /* * If the process has requested that all future mappings * be wired, then heed this. */ if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) vm_map_wire(map, *addr, *addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 127960) +++ head/sys/vm/vm_object.c (revision 127961) @@ -1,2173 +1,2169 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define EASY_SCAN_FACTOR 8 #define MSYNC_FLUSH_HARDSEQ 0x01 #define MSYNC_FLUSH_SOFTSEQ 0x02 /* * msync / VM object flushing optimizations */ static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ; SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags, CTLFLAG_RW, &msync_flush_flags, 0, ""); static void vm_object_qcollapse(vm_object_t object); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. 
* * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static long object_collapses; static long object_bypasses; static int next_index; static uma_zone_t obj_zone; #define VM_OBJECTS_INIT 256 static void vm_object_zinit(void *mem, int size); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages", object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); } #endif static void vm_object_zinit(void *mem, int size) { vm_object_t object; object = (vm_object_t)mem; bzero(&object->mtx, sizeof(object->mtx)); VM_OBJECT_LOCK_INIT(object); /* These are true for any object that has been freed */ object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; } void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { int incr; TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->root = NULL; object->type = type; object->size = size; object->generation = 1; object->ref_count = 1; object->flags = 0; if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) object->flags = OBJ_ONEMAPPING; if (size > (PQ_L2_SIZE / 3 + PQ_PRIME1)) incr = PQ_L2_SIZE / 3 + PQ_PRIME1; else incr = size; do object->pg_color = next_index; while (!atomic_cmpset_int(&next_index, object->pg_color, (object->pg_color + incr) & PQ_L2_MASK)); object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); VM_OBJECT_LOCK_INIT(&kernel_object_store); _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); /* * The kmem object's mutex is given a unique name, instead of * "vm object", to avoid false reports of lock-order reversal * with a system map mutex. 
*/ mtx_init(VM_OBJECT_MTX(kmem_object), "kmem object", NULL, MTX_DEF); _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); uma_prealloc(obj_zone, VM_OBJECTS_INIT); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->flags &= ~bits; } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0); } } /* * vm_object_allocate_wait * * Return a new object with the given size, and give the user the * option of waiting for it to complete or failing if the needed * memory isn't available. */ vm_object_t vm_object_allocate_wait(objtype_t type, vm_pindex_t size, int flags) { vm_object_t result; result = (vm_object_t) uma_zalloc(obj_zone, flags); if (result != NULL) _vm_object_allocate(type, size, result); return (result); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { return(vm_object_allocate_wait(type, size, M_WAITOK)); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { struct vnode *vp; int flags; if (object == NULL) return; VM_OBJECT_LOCK(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; VI_LOCK(vp); VM_OBJECT_UNLOCK(object); for (flags = LK_INTERLOCK; vget(vp, flags, curthread); flags = 0) printf("vm_object_reference: delay in vget\n"); } else VM_OBJECT_UNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference_locked: dead object referenced")); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. 
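The pip (paging-in-progress) helpers above implement a counted wait: producers raise and lower the counter under the object lock, and vm_object_pip_wait() sleeps until it reaches zero, with OBJ_PIPWNT recording that a waiter exists. A userland model of the same idea using a mutex and condition variable (illustrative names only, not the kernel API):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t obj_cv = PTHREAD_COND_INITIALIZER;
static int paging_in_progress;

/* Analogue of vm_object_pip_add(object, 1). */
static void
pip_add(void)
{
        pthread_mutex_lock(&obj_lock);
        paging_in_progress++;
        pthread_mutex_unlock(&obj_lock);
}

/* Analogue of vm_object_pip_wakeup(): drop the count, wake waiters at zero. */
static void
pip_wakeup(void)
{
        pthread_mutex_lock(&obj_lock);
        if (--paging_in_progress == 0)
                pthread_cond_broadcast(&obj_cv);
        pthread_mutex_unlock(&obj_lock);
}

/* Analogue of vm_object_pip_wait(): sleep until no paging is in progress. */
static void
pip_wait(void)
{
        pthread_mutex_lock(&obj_lock);
        while (paging_in_progress != 0)
                pthread_cond_wait(&obj_cv, &obj_lock);
        pthread_mutex_unlock(&obj_lock);
}

static void *
pager(void *arg)
{
        (void)arg;
        sleep(1);               /* pretend to do I/O on the object's pages */
        pip_wakeup();
        return (NULL);
}

int
main(void)
{
        pthread_t t;

        pip_add();
        pthread_create(&t, NULL, pager, NULL);
        pip_wait();             /* blocks until the "pager" thread finishes */
        printf("paging complete\n");
        pthread_join(t, NULL);
        return (0);
}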
*/ void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; GIANT_REQUIRED; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vprint("vm_object_vndeallocate", vp); panic("vm_object_vndeallocate: bad object reference count"); } #endif object->ref_count--; if (object->ref_count == 0) { mp_fixme("Unlocked vflag access."); vp->v_vflag &= ~VV_TEXT; } VM_OBJECT_UNLOCK(object); /* * vrele may need a vop lock */ vrele(vp); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; while (object != NULL) { /* * In general, the object should be locked when working with * its type. In this case, in order to maintain proper lock * ordering, an exception is possible because a vnode-backed * object never changes its type. */ if (object->type == OBJT_VNODE) mtx_lock(&Giant); VM_OBJECT_LOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); mtx_unlock(&Giant); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_UNLOCK(object); return; } else if (object->ref_count == 1) { if (object->shadow_count == 0) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); if (!VM_OBJECT_TRYLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_UNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. 
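When the shadow object's lock cannot be taken without risking a lock-order reversal, vm_object_deallocate() above backs off: it keeps its reference, releases its own lock, sleeps briefly (the tsleep for one tick just below), and retries the whole sequence. A minimal userland model of that trylock-and-back-off loop:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t robject_lock = PTHREAD_MUTEX_INITIALIZER;

/* Take both locks, backing off instead of blocking on the second one. */
static void
lock_both(void)
{
        for (;;) {
                pthread_mutex_lock(&object_lock);
                if (pthread_mutex_trylock(&robject_lock) == 0)
                        return;         /* got both, done */
                /*
                 * Could not get the second lock: release the first, yield
                 * briefly (the kernel uses tsleep(..., "vmo_de", 1)), and
                 * try the whole sequence again from the top.
                 */
                pthread_mutex_unlock(&object_lock);
                usleep(1000);
        }
}

int
main(void)
{
        lock_both();
        printf("holding both locks\n");
        pthread_mutex_unlock(&robject_lock);
        pthread_mutex_unlock(&object_lock);
        return (0);
}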
*/ tsleep(&proc0, PVM, "vmo_de", 1); continue; } if ((robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_UNLOCK(object); vm_object_pip_wait(robject, "objde1"); VM_OBJECT_LOCK(object); goto retry; } else if (object->paging_in_progress) { VM_OBJECT_UNLOCK(robject); object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "objde2", 0); VM_OBJECT_LOCK(robject); VM_OBJECT_LOCK(object); goto retry; } VM_OBJECT_UNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_UNLOCK(object); continue; } VM_OBJECT_UNLOCK(robject); } VM_OBJECT_UNLOCK(object); return; } doterm: temp = object->backing_object; if (temp != NULL) { VM_OBJECT_LOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; temp->generation++; VM_OBJECT_UNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_UNLOCK(object); object = temp; } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p; int s; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); VM_OBJECT_LOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Now free any remaining pages. For internal objects, this also * removes them from paging queues. Don't free wired pages, just * remove them from the object. */ s = splvm(); vm_page_lock_queues(); while ((p = TAILQ_FIRST(&object->memq)) != NULL) { KASSERT(!p->busy && (p->flags & PG_BUSY) == 0, ("vm_object_terminate: freeing busy page %p " "p->busy = %d, p->flags %x\n", p, p->busy, p->flags)); if (p->wire_count == 0) { vm_page_busy(p); vm_page_free(p); cnt.v_pfree++; } else { vm_page_busy(p); vm_page_remove(p); } } vm_page_unlock_queues(); splx(s); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_UNLOCK(object); /* * Remove the object from the global object list. */ mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); wakeup(object); /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. 
XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. */ void vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags) { vm_page_t p, np; vm_pindex_t tstart, tend; vm_pindex_t pi; int clearobjflags; int pagerflags; int curgeneration; GIANT_REQUIRED; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_VNODE || (object->flags & OBJ_MIGHTBEDIRTY) == 0) return; pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0; vm_object_set_flag(object, OBJ_CLEANING); tstart = start; if (end == 0) { tend = object->size; } else { tend = end; } vm_page_lock_queues(); /* * If the caller is smart and only msync()s a range he knows is * dirty, we may be able to avoid an object scan. This results in * a phenominal improvement in performance. We cannot do this * as a matter of course because the object may be huge - e.g. * the size might be in the gigabytes or terrabytes. */ if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) { vm_pindex_t tscan; int scanlimit; int scanreset; scanreset = object->resident_page_count / EASY_SCAN_FACTOR; if (scanreset < 16) scanreset = 16; pagerflags |= VM_PAGER_IGNORE_CLEANCHK; scanlimit = scanreset; tscan = tstart; while (tscan < tend) { curgeneration = object->generation; p = vm_page_lookup(object, tscan); if (p == NULL || p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) { if (--scanlimit == 0) break; ++tscan; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { if (--scanlimit == 0) break; ++tscan; continue; } /* * If we have been asked to skip nosync pages and * this is a nosync page, we can't continue. */ if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { if (--scanlimit == 0) break; ++tscan; continue; } scanlimit = scanreset; /* * This returns 0 if it was unable to busy the first * page (i.e. had to sleep). */ tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags); } /* * If everything was dirty and we flushed it successfully, * and the requested range is not the entire object, we * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can * return immediately. */ if (tscan >= tend && (tstart || tend < object->size)) { vm_page_unlock_queues(); vm_object_clear_flag(object, OBJ_CLEANING); return; } pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK; } /* * Generally set CLEANCHK interlock and make the page read-only so * we can then clear the object flags. * * However, if this is a nosync mmap then the object is likely to * stay dirty so do not mess with the page and do not clear the * object flags. 
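/*
 * Editor's illustrative sketch (not FreeBSD code): the scanlimit/scanreset
 * heuristic used by the MSYNC_FLUSH_HARDSEQ path above.  The scan walks the
 * range sequentially, tolerating up to 'scanlimit' consecutive pages that
 * need no work; every page that is actually flushed refills the budget.
 * page_is_dirty() and page_flush() are hypothetical stand-ins for the
 * vm_page_lookup()/vm_object_page_collect_flush() calls in the real code.
 */
#include <stdbool.h>
#include <stddef.h>

#define EASY_SCAN_FACTOR_SKETCH 8       /* assumed divisor, as in the real code */

static size_t
sequential_clean(bool (*page_is_dirty)(size_t), void (*page_flush)(size_t),
    size_t start, size_t end, size_t resident_pages)
{
        size_t pi, flushed = 0;
        size_t scanreset, scanlimit;

        scanreset = resident_pages / EASY_SCAN_FACTOR_SKETCH;
        if (scanreset < 16)
                scanreset = 16;         /* never give up too eagerly */
        scanlimit = scanreset;

        for (pi = start; pi < end; pi++) {
                if (!page_is_dirty(pi)) {
                        if (--scanlimit == 0)
                                break;  /* too many clean pages in a row */
                        continue;
                }
                scanlimit = scanreset;  /* found work: refill the budget */
                page_flush(pi);
                flushed++;
        }
        return (flushed);
}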
*/ clearobjflags = 1; TAILQ_FOREACH(p, &object->memq, listq) { vm_page_flag_set(p, PG_CLEANCHK); if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) clearobjflags = 0; else pmap_page_protect(p, VM_PROT_READ); } if (clearobjflags && (tstart == 0) && (tend == object->size)) { struct vnode *vp; vm_object_clear_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); if (vp->v_iflag & VI_OBJDIRTY) vp->v_iflag &= ~VI_OBJDIRTY; VI_UNLOCK(vp); } } rescan: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p; p = np) { int n; np = TAILQ_NEXT(p, listq); again: pi = p->pindex; if (((p->flags & PG_CLEANCHK) == 0) || (pi < tstart) || (pi >= tend) || (p->valid == 0) || ((p->queue - p->pc) == PQ_CACHE)) { vm_page_flag_clear(p, PG_CLEANCHK); continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { vm_page_flag_clear(p, PG_CLEANCHK); continue; } /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were * not cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { vm_page_flag_clear(p, PG_CLEANCHK); continue; } n = vm_object_page_collect_flush(object, p, curgeneration, pagerflags); if (n == 0) goto rescan; if (object->generation != curgeneration) goto rescan; /* * Try to optimize the next page. If we can't we pick up * our (random) scan where we left off. */ if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) { if ((p = vm_page_lookup(object, pi + n)) != NULL) goto again; } } vm_page_unlock_queues(); #if 0 VOP_FSYNC(vp, NULL, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc); #endif vm_object_clear_flag(object, OBJ_CLEANING); return; } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags) { int runlen; int s; int maxf; int chkb; int maxb; int i; vm_pindex_t pi; vm_page_t maf[vm_pageout_page_count]; vm_page_t mab[vm_pageout_page_count]; vm_page_t ma[vm_pageout_page_count]; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pi = p->pindex; while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) { vm_page_lock_queues(); if (object->generation != curgeneration) { splx(s); return(0); } } maxf = 0; for(i = 1; i < vm_pageout_page_count; i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi + i)) != NULL) { if ((tp->flags & PG_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->flags & PG_CLEANCHK) == 0) || (tp->busy != 0)) break; if((tp->queue - tp->pc) == PQ_CACHE) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } maf[ i - 1 ] = tp; maxf++; continue; } break; } maxb = 0; chkb = vm_pageout_page_count - maxf; if (chkb) { for(i = 1; i < chkb;i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi - i)) != NULL) { if ((tp->flags & PG_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->flags & PG_CLEANCHK) == 0) || (tp->busy != 0)) break; if ((tp->queue - tp->pc) == PQ_CACHE) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { vm_page_flag_clear(tp, PG_CLEANCHK); break; } mab[ i - 1 ] = tp; maxb++; continue; } break; } } for(i = 0; i < maxb; i++) { int index = (maxb - i) - 1; ma[index] = mab[i]; vm_page_flag_clear(ma[index], PG_CLEANCHK); } vm_page_flag_clear(p, PG_CLEANCHK); ma[maxb] = p; for(i = 0; i < maxf; i++) { int index = (maxb + i) + 1; ma[index] = 
maf[i]; vm_page_flag_clear(ma[index], PG_CLEANCHK); } runlen = maxb + maxf + 1; splx(s); vm_pageout_flush(ma, runlen, pagerflags); for (i = 0; i < runlen; i++) { if (ma[i]->valid & ma[i]->dirty) { pmap_page_protect(ma[i], VM_PROT_READ); vm_page_flag_set(ma[i], PG_CLEANCHK); /* * maxf will end up being the actual number of pages * we wrote out contiguously, non-inclusive of the * first page. We do not count look-behind pages. */ if (i >= maxb + 1 && (maxf > i - maxb - 1)) maxf = i - maxb - 1; } } return(maxf + 1); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ void vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; int flags; if (object == NULL) return; VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); VM_OBJECT_UNLOCK(object); object = backing_object; offset += object->backing_object_offset; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_UNLOCK(object); mtx_lock(&Giant); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? OBJPC_INVAL : 0; VM_OBJECT_LOCK(object); vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0, curthread); mtx_unlock(&Giant); VM_OBJECT_LOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), FALSE); } VM_OBJECT_UNLOCK(object); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise) { vm_pindex_t end, tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; end = pindex + count; /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; VM_OBJECT_LOCK(tobject); shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. 
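/*
 * Editor's illustrative sketch (not FreeBSD code): how
 * vm_object_page_collect_flush() above assembles its output run.  Pages
 * collected while scanning backward (mab[]) must be reversed so the final
 * array ma[] is in ascending index order: look-behind pages, then the pivot
 * page, then the look-ahead pages (maf[]).  Plain ints stand in for
 * vm_page_t here.
 */
static int
assemble_run(const int *mab, int maxb, int pivot, const int *maf, int maxf,
    int *ma)
{
        int i;

        for (i = 0; i < maxb; i++)
                ma[maxb - 1 - i] = mab[i];      /* reverse the look-behind */
        ma[maxb] = pivot;
        for (i = 0; i < maxf; i++)
                ma[maxb + 1 + i] = maf[i];      /* look-ahead already in order */
        return (maxb + maxf + 1);               /* runlen handed to the pager */
}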
*/ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } m = vm_page_lookup(tobject, tpindex); if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_LOCK(backing_object); VM_OBJECT_UNLOCK(tobject); tobject = backing_object; tpindex += OFF_TO_IDX(tobject->backing_object_offset); goto shadowlookup; } /* * If the page is busy or not in a normal active state, * we skip it. If the page is not managed there are no * page queues to mess with. Things can break if we mess * with pages in any of the below states. */ vm_page_lock_queues(); if (m->hold_count || m->wire_count || (m->flags & PG_UNMANAGED) || m->valid != VM_PAGE_BITS_ALL) { vm_page_unlock_queues(); goto unlock_tobject; } if (vm_page_sleep_if_busy(m, TRUE, "madvpo")) { VM_OBJECT_UNLOCK(tobject); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else if (advise == MADV_DONTNEED) { vm_page_dontneed(m); } else if (advise == MADV_FREE) { /* * Mark the page clean. This will allow the page * to be freed up by the system. However, such pages * are often reused quickly by malloc()/free() * so we do not do anything that would cause * a page fault if we can help it. * * Specifically, we do not try to actually free * the page now nor do we try to put it in the * cache (which would cause a page fault on reuse). * * But we do make the page is freeable as we * can without actually taking the step of unmapping * it. */ pmap_clear_modify(m); m->dirty = 0; m->act_count = 0; vm_page_dontneed(m); } vm_page_unlock_queues(); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: VM_OBJECT_UNLOCK(tobject); } } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_LOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_UNLOCK(source); return; } VM_OBJECT_UNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, length); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. 
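/*
 * Editor's illustrative sketch (not FreeBSD code): the shadow-chain walk
 * behind the shadowlookup loop above.  When a page is not resident in the
 * current object, the search descends to the backing object, shifting the
 * page index by the (page-granular) backing offset.  struct sobj and the
 * 'resident' callback are hypothetical stand-ins; locking is omitted.
 */
#include <stddef.h>

struct sobj {
        struct sobj     *backing_object;
        size_t          backing_offset_pages;  /* OFF_TO_IDX(backing_object_offset) */
};

static struct sobj *
shadow_lookup(struct sobj *o, size_t *pindexp,
    int (*resident)(struct sobj *, size_t))
{
        size_t pindex = *pindexp;

        while (o != NULL && !resident(o, pindex)) {
                pindex += o->backing_offset_pages;
                o = o->backing_object;
        }
        *pindexp = pindex;
        return (o);     /* object holding the page, or NULL */
}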
*/ result->backing_object = source; if (source != NULL) { VM_OBJECT_LOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; source->generation++; if (length < source->size) length = source->size; if (length > PQ_L2_SIZE / 3 + PQ_PRIME1 || source->generation > 1) length = PQ_L2_SIZE / 3 + PQ_PRIME1; result->pg_color = (source->pg_color + length * source->generation) & PQ_L2_MASK; VM_OBJECT_UNLOCK(source); next_index = (result->pg_color + PQ_L2_SIZE / 3 + PQ_PRIME1) & PQ_L2_MASK; } /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m; vm_object_t orig_object, new_object, source; vm_pindex_t offidxstart, offidxend; vm_size_t idx, size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_UNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + OFF_TO_IDX(entry->end - entry->start); size = offidxend - offidxstart; /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); VM_OBJECT_LOCK(new_object); VM_OBJECT_LOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_LOCK(source); LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; source->generation++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } for (idx = 0; idx < size; idx++) { retry: m = vm_page_lookup(orig_object, offidxstart + idx); if (m == NULL) continue; /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ vm_page_lock_queues(); if ((m->flags & PG_BUSY) || m->busy) { vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); VM_OBJECT_UNLOCK(orig_object); VM_OBJECT_UNLOCK(new_object); msleep(m, &vm_page_queue_mtx, PDROP | PVM, "spltwt", 0); VM_OBJECT_LOCK(new_object); VM_OBJECT_LOCK(orig_object); goto retry; } vm_page_busy(m); vm_page_rename(m, new_object, idx); /* page automatically made dirty by rename and cache handled */ vm_page_busy(m); vm_page_unlock_queues(); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. 
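/*
 * Editor's illustrative sketch (not FreeBSD code): the page-color choice
 * made by vm_object_shadow() above.  The shadow's starting color is derived
 * from the source color, a clamped length and the source generation, then
 * folded into the color space with a power-of-two mask.  The mask and prime
 * below are assumed placeholder values, not the kernel's PQ_L2_MASK and
 * PQ_PRIME1.
 */
#define COLOR_MASK_SKETCH       0x3f    /* assumed: number of colors - 1 */
#define COLOR_PRIME_SKETCH      31      /* assumed stand-in for PQ_PRIME1 */

static unsigned
shadow_pg_color(unsigned src_color, unsigned src_generation,
    unsigned long length, unsigned long src_size)
{
        unsigned long clamp = (COLOR_MASK_SKETCH + 1) / 3 + COLOR_PRIME_SKETCH;

        if (length < src_size)
                length = src_size;
        if (length > clamp || src_generation > 1)
                length = clamp;
        return ((src_color + length * src_generation) & COLOR_MASK_SKETCH);
}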
*/ swap_pager_copy(orig_object, new_object, offidxstart, 0); } VM_OBJECT_UNLOCK(orig_object); vm_page_lock_queues(); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_wakeup(m); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_LOCK(new_object); } #define OBSC_TEST_ALL_SHADOWED 0x0001 #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static int vm_object_backing_scan(vm_object_t object, int op) { int s; int r = 1; vm_page_t p; vm_object_t backing_object; vm_pindex_t backing_offset_index; s = splvm(); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if (op & OBSC_TEST_ALL_SHADOWED) { /* * We do not want to have to test for the existence of * swap pages in the backing object. XXX but with the * new swapper this would be pretty easy to do. * * XXX what about anonymous MAP_SHARED memory that hasn't * been ZFOD faulted yet? If we do not test for this, the * shadow test may succeed! XXX */ if (backing_object->type != OBJT_DEFAULT) { splx(s); return (0); } } if (op & OBSC_COLLAPSE_WAIT) { vm_object_set_flag(backing_object, OBJ_DEAD); } /* * Our scan */ p = TAILQ_FIRST(&backing_object->memq); while (p) { vm_page_t next = TAILQ_NEXT(p, listq); vm_pindex_t new_pindex = p->pindex - backing_offset_index; if (op & OBSC_TEST_ALL_SHADOWED) { vm_page_t pp; /* * Ignore pages outside the parent object's range * and outside the parent object's mapping of the * backing object. * * note that we do not busy the backing object's * page. */ if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { p = next; continue; } /* * See if the parent has the page or if the parent's * object pager has the page. If the parent has the * page but the page is not valid, the parent's * object pager must have the page. * * If this fails, the parent does not completely shadow * the object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ( (pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL) ) { r = 0; break; } } /* * Check for busy page */ if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) { vm_page_t pp; vm_page_lock_queues(); if (op & OBSC_COLLAPSE_NOWAIT) { if ((p->flags & PG_BUSY) || !p->valid || p->hold_count || p->wire_count || p->busy) { vm_page_unlock_queues(); p = next; continue; } } else if (op & OBSC_COLLAPSE_WAIT) { if ((p->flags & PG_BUSY) || p->busy) { vm_page_flag_set(p, PG_WANTED | PG_REFERENCED); VM_OBJECT_UNLOCK(backing_object); VM_OBJECT_UNLOCK(object); msleep(p, &vm_page_queue_mtx, PDROP | PVM, "vmocol", 0); VM_OBJECT_LOCK(object); VM_OBJECT_LOCK(backing_object); /* * If we slept, anything could have * happened. Since the object is * marked dead, the backing offset * should not have changed so we * just restart our scan. */ p = TAILQ_FIRST(&backing_object->memq); continue; } } /* * Busy the page */ vm_page_busy(p); vm_page_unlock_queues(); KASSERT( p->object == backing_object, ("vm_object_qcollapse(): object mismatch") ); /* * Destroy any associated swap */ if (backing_object->type == OBJT_SWAP) { swap_pager_freespace( backing_object, p->pindex, 1 ); } if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { /* * Page is out of the parent object's range, we * can simply destroy it. 
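/*
 * Editor's illustrative sketch (not FreeBSD code): the essence of the
 * OBSC_TEST_ALL_SHADOWED pass above.  Every resident page of the backing
 * object that falls inside the parent's mapping must be shadowed by a valid
 * parent page (or by the parent's pager), otherwise the bypass optimization
 * has to be abandoned.  Arrays of flags stand in for resident-page lookups;
 * swap-backed pages are ignored, as the real test only runs for
 * OBJT_DEFAULT backing objects.
 */
#include <stdbool.h>
#include <stddef.h>

static bool
parent_shadows_all(const bool *backing_resident, size_t backing_size,
    const bool *parent_valid, size_t parent_size, size_t backing_offset_index)
{
        size_t pindex, new_pindex;

        for (pindex = 0; pindex < backing_size; pindex++) {
                if (!backing_resident[pindex])
                        continue;
                if (pindex < backing_offset_index)
                        continue;               /* outside the parent's mapping */
                new_pindex = pindex - backing_offset_index;
                if (new_pindex >= parent_size)
                        continue;               /* beyond the parent's range */
                if (!parent_valid[new_pindex])
                        return (false);         /* parent does not shadow this page */
        }
        return (true);
}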
*/ vm_page_lock_queues(); pmap_remove_all(p); vm_page_free(p); vm_page_unlock_queues(); p = next; continue; } pp = vm_page_lookup(object, new_pindex); if ( pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL) ) { /* * page already exists in parent OR swap exists * for this location in the parent. Destroy * the original page from the backing object. * * Leave the parent's page alone */ vm_page_lock_queues(); pmap_remove_all(p); vm_page_free(p); vm_page_unlock_queues(); p = next; continue; } /* * Page does not exist in parent, rename the * page from the backing object to the main object. * * If the page was mapped to a process, it can remain * mapped through the rename. */ vm_page_lock_queues(); vm_page_rename(p, object, new_pindex); vm_page_unlock_queues(); /* page automatically made dirty by rename */ } p = next; } splx(s); return (r); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED); if (backing_object->ref_count != 1) return; backing_object->ref_count += 2; vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT); backing_object->ref_count -= 2; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (TRUE) { vm_object_t backing_object; /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_LOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(backing_object); break; } if ( object->paging_in_progress != 0 || backing_object->paging_in_progress != 0 ) { vm_object_qcollapse(object); VM_OBJECT_UNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_backing_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT); /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. 
*/ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; if (backing_object->backing_object) { VM_OBJECT_LOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ backing_object->backing_object->generation++; VM_OBJECT_UNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object)); VM_OBJECT_UNLOCK(backing_object); mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE( &vm_object_list, backing_object, object_list ); mtx_unlock(&vm_object_list_mtx); uma_zfree(obj_zone, backing_object); object_collapses++; } else { vm_object_t new_backing_object; /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (vm_object_backing_scan(object, OBSC_TEST_ALL_SHADOWED) == 0) { VM_OBJECT_UNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_LOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; new_backing_object->generation++; vm_object_reference_locked(new_backing_object); VM_OBJECT_UNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_UNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * Removes all physical pages in the given range from the * object's list of pages. If the range's end is zero, all * physical pages from the range's start to the end of the object * are deleted. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, boolean_t clean_only) { vm_page_t p, next; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->resident_page_count == 0) return; /* * Since physically-backed objects do not use managed pages, we can't * remove pages from the object (we must instead remove the page * references, and then destroy the object). */ KASSERT(object->type != OBJT_PHYS, ("attempt to remove pages from a physical object")); vm_object_pip_add(object, 1); again: vm_page_lock_queues(); if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < start) { p = vm_page_splay(start, object->root); if ((object->root = p)->pindex < start) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. 
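/*
 * Editor's illustrative sketch (not FreeBSD code): the decision structure
 * of vm_object_collapse() above, reduced to a pure function.  The enum and
 * the simplified object fields are hypothetical; the real code also has to
 * worry about locks and in-flight paging before it commits to either path.
 */
#include <stdbool.h>

enum collapse_action { COLLAPSE_NONE, COLLAPSE_FULL, COLLAPSE_BYPASS };

struct cobj_state {
        bool    anonymous;      /* OBJT_DEFAULT/OBJT_SWAP, no handle, not dead */
        bool    paging;         /* paging_in_progress != 0 */
        int     ref_count;
};

static enum collapse_action
collapse_decision(const struct cobj_state *object,
    const struct cobj_state *backing, bool parent_shadows_all)
{
        if (!object->anonymous || !backing->anonymous)
                return (COLLAPSE_NONE);
        if (object->paging || backing->paging)
                return (COLLAPSE_NONE);         /* qcollapse at best */
        if (backing->ref_count == 1)
                return (COLLAPSE_FULL);         /* absorb pages and pager */
        if (parent_shadows_all)
                return (COLLAPSE_BYPASS);       /* skip to backing->backing */
        return (COLLAPSE_NONE);
}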
*/ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); if (p->wire_count != 0) { pmap_remove_all(p); if (!clean_only) p->valid = 0; continue; } if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { pmap_page_protect(p, VM_PROT_READ | VM_PROT_EXECUTE); if (p->valid & p->dirty) continue; } vm_page_busy(p); pmap_remove_all(p); vm_page_free(p); } vm_page_unlock_queues(); vm_object_pip_wakeup(object); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * next_object Second object into coalesce * next_offset Offset into next_object * * prev_size Size of reference to prev_object * next_size Size of reference to next_object * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex, vm_size_t prev_size, vm_size_t next_size) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_LOCK(prev_object); if (prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = prev_pindex + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, FALSE); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); } /* * Extend the object if necessary. 
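/*
 * Editor's illustrative sketch (not FreeBSD code): the eligibility checks
 * performed by vm_object_coalesce() above, expressed in page units.  The
 * struct and field names are hypothetical simplifications of the real
 * object state.
 */
#include <stdbool.h>
#include <stddef.h>

struct coal_obj {
        bool    anonymous;              /* OBJT_DEFAULT or OBJT_SWAP */
        bool    has_backing_object;
        int     ref_count;
        size_t  size;                   /* in pages */
};

static bool
can_coalesce(const struct coal_obj *prev, size_t prev_pindex, size_t prev_pages)
{
        size_t next_pindex = prev_pindex + prev_pages;

        if (!prev->anonymous)
                return (false);
        if (prev->has_backing_object)
                return (false);         /* pages may be shared via a shadow */
        if (prev->ref_count > 1 && prev->size != next_pindex)
                return (false);         /* would grow an object others map */
        return (true);
}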
*/ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_UNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) == 0) vp->v_iflag |= VI_OBJDIRTY; VI_UNLOCK(vp); } } #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ LIST_FOREACH(p, &allproc, p_list) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; if (_vm_object_in_map(kmem_map, object, 0)) return 1; if (_vm_object_in_map(pager_map, object, 0)) return 1; if (_vm_object_in_map(buffer_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? 
object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; int nl = 0; int c; TAILQ_FOREACH(object, &vm_object_list, object_list) { vm_pindex_t idx, fidx; vm_pindex_t osize; vm_paddr_t pa = -1, padiff; int rcount; vm_page_t m; db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; osize = object->size; if (osize > 128) osize = 128; for (idx = 0; idx < osize; idx++) { m = vm_page_lookup(object, idx); if (m == NULL) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } continue; } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); padiff >>= PAGE_SHIFT; padiff &= PQ_L2_MASK; if (padiff == 0) { pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; ++rcount; continue; } db_printf(" index(%ld)run(%d)pa(0x%lx)", (long)fidx, rcount, (long)pa); db_printf("pd(%ld)\n", (long)padiff); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = idx; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_object.h =================================================================== --- head/sys/vm/vm_object.h (revision 127960) +++ head/sys/vm/vm_object.h (revision 127961) @@ -1,224 +1,220 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include enum obj_type { OBJT_DEFAULT, OBJT_SWAP, OBJT_VNODE, OBJT_DEVICE, OBJT_PHYS, OBJT_DEAD }; typedef u_char objtype_t; /* * Types defined: * * vm_object_t Virtual memory object. * * List of locks * (c) const until freed * */ struct vm_object { struct mtx mtx; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(, vm_page) memq; /* list of resident pages */ vm_page_t root; /* root of the resident page splay tree */ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ u_short paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; } devp; /* * Swap pager * * swp_bcount - number of swap 'swblock' metablocks, each * contains up to 16 swapblk assignments. 
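/*
 * Editor's illustrative sketch (not FreeBSD code): the swp_bcount comment
 * above says each swap metablock ("swblock") covers up to 16 swapblk
 * assignments, so an upper bound on metablocks for n assigned pages is a
 * simple ceiling division.  The constant is restated here only for
 * illustration and is an assumption of this sketch.
 */
#define SWAP_META_PAGES_SKETCH  16      /* assumed: slots per swblock */

static int
swblocks_for_pages(int npages)
{
        return ((npages + SWAP_META_PAGES_SKETCH - 1) / SWAP_META_PAGES_SKETCH);
}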
* see vm/swap_pager.h */ struct { int swp_bcount; } swp; } un_pager; }; /* * Flags */ #define OBJ_ACTIVE 0x0004 /* active objects */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_WRITEABLE 0x0080 /* object has been made writable */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty */ #define OBJ_CLEANING 0x0200 #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if PG_NOSYNC */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; extern struct vm_object kmem_object_store; #define kernel_object (&kernel_object_store) #define kmem_object (&kmem_object_store) #define VM_OBJECT_LOCK(object) mtx_lock(&(object)->mtx) #define VM_OBJECT_LOCK_ASSERT(object, type) \ mtx_assert(&(object)->mtx, (type)) #define VM_OBJECT_LOCK_INIT(object) mtx_init(&(object)->mtx, "vm object", \ NULL, MTX_DEF | MTX_DUPOK) #define VM_OBJECT_LOCKED(object) mtx_owned(&(object)->mtx) #define VM_OBJECT_MTX(object) (&(object)->mtx) #define VM_OBJECT_TRYLOCK(object) mtx_trylock(&(object)->mtx) #define VM_OBJECT_UNLOCK(object) mtx_unlock(&(object)->mtx) /* * The object must be locked or thread private. */ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); vm_object_t vm_object_allocate_wait (objtype_t, vm_pindex_t, int); void _vm_object_allocate (objtype_t, vm_pindex_t, vm_object_t); boolean_t vm_object_coalesce (vm_object_t, vm_pindex_t, vm_size_t, vm_size_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_vndeallocate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_page_clean (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t); void vm_object_page_remove (vm_object_t, vm_pindex_t, vm_pindex_t, boolean_t); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); void vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_madvise (vm_object_t, vm_pindex_t, int, int); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 127960) +++ head/sys/vm/vm_page.c (revision 127961) @@ -1,1789 +1,1785 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. 
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - a pageq mutex is required when adding or removing a page from a * page queue (vm_page_queue[]), regardless of other mutexes or the * busy state of a page. * * - a hash chain mutex is required when associating or disassociating * a page from the VM PAGE CACHE hash table (vm_page_buckets), * regardless of other mutexes or the busy state of a page. * * - either a hash chain mutex OR a busied page is required in order * to modify the page flags. A hash chain mutex must be obtained in * order to busy a page. A page's flags cannot be modified by a * hash chain mutex if the page is marked busy. 
* * - The object memq mutex is held when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). This * is different from the object's main mutex. * * Generally speaking, you have to be aware of side effects when running * vm_page ops. A vm_page_lookup() will return with the hash chain * locked, whether it was able to lookup the page or not. vm_page_free(), * vm_page_cache(), vm_page_activate(), and a number of other routines * will release the hash chain mutex for you. Intermediate manipulation * routines such as vm_page_flag_set() expect the hash chain to be held * on entry and the hash chain will remain held on return. * * pageq scanning can only occur with the pageq in question locked. * We have a known bottleneck with the active queue, but the cache * and free queues are actually arrays already. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct mtx vm_page_queue_mtx; struct mtx vm_page_queue_free_mtx; vm_page_t vm_page_array = 0; int vm_page_array_size = 0; long first_page = 0; int vm_page_zero_count = 0; /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (cnt.v_page_size == 0) cnt.v_page_size = PAGE_SIZE; if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_size_t npages; vm_paddr_t page_range; vm_paddr_t new_end; int i; vm_paddr_t pa; int nblocks; vm_paddr_t last_pa; /* the biggest memory array is the second group of pages */ vm_paddr_t end; vm_paddr_t biggestsize; int biggestone; vm_paddr_t total; vm_size_t bootpages; total = 0; biggestsize = 0; biggestone = 0; nblocks = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } ++nblocks; total += size; } end = phys_avail[biggestone+1]; /* * Initialize the locks. */ mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF); mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL, MTX_SPIN); /* * Initialize the queue headers for the free queue, the active queue * and the inactive queue. */ vm_pageq_init(); /* * Allocate memory for use when boot strapping the kernel memory * allocator. */ bootpages = UMA_BOOT_PAGES * UMA_SLAB_SIZE; new_end = end - bootpages; new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((caddr_t) mapped, end - new_end); uma_startup((caddr_t)mapped); /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). 
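/*
 * Editor's illustrative sketch (not FreeBSD code): the page accounting done
 * by vm_page_startup() above.  The number of managed pages is what remains
 * of usable physical memory after subtracting the vm_page array itself and
 * the pages stolen for bootstrapping, divided by the page size.  The caller
 * is assumed to pass a total large enough to cover both overheads.
 */
static unsigned long
managed_pages(unsigned long total_bytes, unsigned long bootpages_bytes,
    unsigned long page_size, unsigned long vm_page_struct_size)
{
        unsigned long page_range = total_bytes / page_size;     /* array entries */

        return ((total_bytes - page_range * vm_page_struct_size -
            bootpages_bytes) / page_size);
}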
*/ first_page = phys_avail[0] / PAGE_SIZE; page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page; npages = (total - (page_range * sizeof(struct vm_page)) - (end - new_end)) / PAGE_SIZE; end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. */ vaddr += PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; phys_avail[biggestone + 1] = new_end; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); vm_page_array_size = page_range; /* * Construct the free queue(s) in descending order (by physical * address) so that the first 16MB of physical memory is allocated * last rather than first. On large-memory machines, this avoids * the exhaustion of low physical memory before isa_dmainit has run. */ cnt.v_page_count = 0; cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa && npages-- > 0) { vm_pageq_add_new_page(pa); pa += PAGE_SIZE; } } return (vaddr); } void vm_page_flag_set(vm_page_t m, unsigned short bits) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->flags |= bits; } void vm_page_flag_clear(vm_page_t m, unsigned short bits) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->flags &= ~bits; } void vm_page_busy(vm_page_t m) { KASSERT((m->flags & PG_BUSY) == 0, ("vm_page_busy: page already busy!!!")); vm_page_flag_set(m, PG_BUSY); } /* * vm_page_flash: * * wakeup anyone waiting for the page. */ void vm_page_flash(vm_page_t m) { if (m->flags & PG_WANTED) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } } /* * vm_page_wakeup: * * clear the PG_BUSY flag and wakeup anyone waiting for the * page. * */ void vm_page_wakeup(vm_page_t m) { KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!")); vm_page_flag_clear(m, PG_BUSY); vm_page_flash(m); } void vm_page_io_start(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->busy++; } void vm_page_io_finish(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->busy--; if (m->busy == 0) vm_page_flash(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); --mem->hold_count; KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!")); if (mem->hold_count == 0 && mem->queue == PQ_HOLD) vm_page_free_toq(mem); } /* * vm_page_free: * * Free a page * * The clearing of PG_ZERO is a temporary safety until the code can be * reviewed to determine that PG_ZERO is being properly cleared on * write faults or maps. PG_ZERO was previously cleared in * vm_page_alloc(). */ void vm_page_free(vm_page_t m) { vm_page_flag_clear(m, PG_ZERO); vm_page_free_toq(m); vm_page_zero_idle_wakeup(); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { vm_page_flag_set(m, PG_ZERO); vm_page_free_toq(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if PG_BUSY is set or, * if also_m_busy is TRUE, busy is non-zero. 
Returns TRUE if the * thread slept and the page queues lock was released. * Otherwise, retains the page queues lock and returns FALSE. */ int vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg) { int is_object_locked; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) { vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); /* * Remove mtx_owned() after vm_object locking is finished. */ if ((is_object_locked = m->object != NULL && mtx_owned(&m->object->mtx))) mtx_unlock(&m->object->mtx); msleep(m, &vm_page_queue_mtx, PDROP | PVM, msg, 0); if (is_object_locked) mtx_lock(&m->object->mtx); return (TRUE); } return (FALSE); } /* * vm_page_dirty: * * make page all dirty */ void vm_page_dirty(vm_page_t m) { KASSERT(m->queue - m->pc != PQ_CACHE, ("vm_page_dirty: page in cache!")); KASSERT(m->queue - m->pc != PQ_FREE, ("vm_page_dirty: page is free!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_splay: * * Implements Sleator and Tarjan's top-down splay algorithm. Returns * the vm_page containing the given pindex. If, however, that * pindex is not found in the vm_object, returns a vm_page that is * adjacent to the pindex, coming before or after it. */ vm_page_t vm_page_splay(vm_pindex_t pindex, vm_page_t root) { struct vm_page dummy; vm_page_t lefttreemax, righttreemin, y; if (root == NULL) return (root); lefttreemax = righttreemin = &dummy; for (;; root = y) { if (pindex < root->pindex) { if ((y = root->left) == NULL) break; if (pindex < y->pindex) { /* Rotate right. */ root->left = y->right; y->right = root; root = y; if ((y = root->left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->left = root; righttreemin = root; } else if (pindex > root->pindex) { if ((y = root->right) == NULL) break; if (pindex > y->pindex) { /* Rotate left. */ root->right = y->left; y->left = root; root = y; if ((y = root->right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->right = root; lefttreemax = root; } else break; } /* Assemble the new root. */ lefttreemax->right = root->left; righttreemin->left = root->right; root->left = dummy.right; root->right = dummy.left; return (root); } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The pagetables are not updated but will presumably fault the page * in if necessary, or if a kernel page the caller will at some point * enter the page into the kernel's pmap. We are not allowed to block * here so we *can't* do this anyway. * * The object and page must be locked, and must be splhigh. * This routine may not block. */ void vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t root; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (m->object != NULL) panic("vm_page_insert: page already inserted"); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. 
*/ root = object->root; if (root == NULL) { m->left = NULL; m->right = NULL; TAILQ_INSERT_TAIL(&object->memq, m, listq); } else { root = vm_page_splay(pindex, root); if (pindex < root->pindex) { m->left = root->left; m->right = root; root->left = NULL; TAILQ_INSERT_BEFORE(root, m, listq); } else if (pindex == root->pindex) panic("vm_page_insert: offset already allocated"); else { m->right = root->right; m->left = root; root->right = NULL; TAILQ_INSERT_AFTER(&object->memq, root, m, listq); } } object->root = m; object->generation++; /* * show that the object has one more resident page. */ object->resident_page_count++; /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags. */ if (m->flags & PG_WRITEABLE) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page * table and the object page list, but do not invalidate/terminate * the backing store. * * The object and page must be locked, and at splhigh. * The underlying pmap entry (if any) is NOT removed here. * This routine may not block. */ void vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t root; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->object == NULL) return; #ifndef __alpha__ VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); #endif if ((m->flags & PG_BUSY) == 0) { panic("vm_page_remove: page not busy"); } /* * Basically destroy the page. */ vm_page_wakeup(m); object = m->object; /* * Now remove from the object's list of backed pages. */ if (m != object->root) vm_page_splay(m->pindex, object->root); if (m->left == NULL) root = m->right; else { root = vm_page_splay(m->pindex, m->left); root->right = m->right; } object->root = root; TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; object->generation++; m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. * This routine may not block. * This is a critical path routine */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if ((m = object->root) != NULL && m->pindex != pindex) { m = vm_page_splay(pindex, m); if ((object->root = m)->pindex != pindex) m = NULL; } return (m); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * The object must be locked. * This routine may not block. * * Note: this routine will raise itself to splvm(), the caller need not. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. If the page is on the cache, we have to deactivate it * or vm_page_dirty() will panic. Dirty pages are not allowed * on the cache. 
*/ void vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { int s; s = splvm(); vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); if (m->queue - m->pc == PQ_CACHE) vm_page_deactivate(m); vm_page_dirty(m); splx(s); } /* * vm_page_select_cache: * * Find a page on the cache queue with color optimization. As pages * might be found, but not applicable, they are deactivated. This * keeps us from using potentially busy cached pages. * * This routine must be called at splvm(). * This routine may not block. */ vm_page_t vm_page_select_cache(int color) { vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); while (TRUE) { m = vm_pageq_find(PQ_CACHE, color, FALSE); if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->hold_count || m->wire_count || (!VM_OBJECT_TRYLOCK(m->object) && !VM_OBJECT_LOCKED(m->object)))) { vm_page_deactivate(m); continue; } return m; } } /* * vm_page_alloc: * * Allocate and return a memory cell associated * with this VM object/offset pair. * * page_req classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * VM_ALLOC_ZERO zero page * * This routine may not block. * * Additional special handling is required when called from an * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with * the page cache in this case. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { vm_object_t m_object; vm_page_t m = NULL; int color, flags, page_req, s; page_req = req & VM_ALLOC_CLASS_MASK; if ((req & VM_ALLOC_NOOBJ) == 0) { KASSERT(object != NULL, ("vm_page_alloc: NULL object.")); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); color = (pindex + object->pg_color) & PQ_L2_MASK; } else color = pindex & PQ_L2_MASK; /* * The pager is allowed to eat deeper into the free page list. */ if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; s = splvm(); loop: mtx_lock_spin(&vm_page_queue_free_mtx); if (cnt.v_free_count > cnt.v_free_reserved || (page_req == VM_ALLOC_SYSTEM && cnt.v_cache_count == 0 && cnt.v_free_count > cnt.v_interrupt_free_min) || (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) { /* * Allocate from the free queue if the number of free pages * exceeds the minimum for the request class. */ m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0); } else if (page_req != VM_ALLOC_INTERRUPT) { mtx_unlock_spin(&vm_page_queue_free_mtx); /* * Allocatable from cache (non-interrupt only). On success, * we must free the page and try again, thus ensuring that * cnt.v_*_free_min counters are replenished. */ vm_page_lock_queues(); if ((m = vm_page_select_cache(color)) == NULL) { vm_page_unlock_queues(); splx(s); #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif atomic_add_int(&vm_pageout_deficit, 1); pagedaemon_wakeup(); return (NULL); } KASSERT(m->dirty == 0, ("Found dirty cache page %p", m)); m_object = m->object; VM_OBJECT_LOCK_ASSERT(m_object, MA_OWNED); vm_page_busy(m); pmap_remove_all(m); vm_page_free(m); vm_page_unlock_queues(); if (m_object != object) VM_OBJECT_UNLOCK(m_object); goto loop; } else { /* * Not allocatable from cache from interrupt, give up. */ mtx_unlock_spin(&vm_page_queue_free_mtx); splx(s); atomic_add_int(&vm_pageout_deficit, 1); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. 
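 * "Good" means a page taken from PQ_FREE (or recycled from PQ_CACHE)
 * while the request class threshold above held; in outline:
 *
 *	VM_ALLOC_NORMAL:	v_free_count > v_free_reserved
 *	VM_ALLOC_SYSTEM:	as above, or v_free_count > v_interrupt_free_min
 *				when the cache is empty
 *	VM_ALLOC_INTERRUPT:	any v_free_count > 0, and never from the cache
 *
 * This summary is for illustration; the tests above are authoritative.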
*/ KASSERT( m != NULL, ("vm_page_alloc(): missing page on free queue\n") ); /* * Remove from free queue */ vm_pageq_remove_nowakeup(m); /* * Initialize structure. Only the PG_ZERO flag is inherited. */ flags = PG_BUSY; if (m->flags & PG_ZERO) { vm_page_zero_count--; if (req & VM_ALLOC_ZERO) flags = PG_ZERO | PG_BUSY; } m->flags = flags; if (req & VM_ALLOC_WIRED) { atomic_add_int(&cnt.v_wire_count, 1); m->wire_count = 1; } else m->wire_count = 0; m->hold_count = 0; m->act_count = 0; m->busy = 0; m->valid = 0; KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m)); mtx_unlock_spin(&vm_page_queue_free_mtx); /* * vm_page_insert() is safe prior to the splx(). Note also that * inserting a page here does not insert it into the pmap (which * could cause us to block allocating memory). We cannot block * anywhere. */ if ((req & VM_ALLOC_NOOBJ) == 0) vm_page_insert(m, object, pindex); else m->pindex = pindex; /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); splx(s); return (m); } /* * vm_wait: (also see VM_WAIT macro) * * Block until free pages are available for allocation * - Called in various places before memory allocations. */ void vm_wait(void) { int s; s = splvm(); vm_page_lock_queues(); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_mtx, PDROP | PSWP, "VMWait", 0); } else { if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PVM, "vmwait", 0); } splx(s); } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Block until free pages are available for allocation * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { int s; s = splvm(); vm_page_lock_queues(); if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PUSER, "pfault", 0); splx(s); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page queues must be locked. * This routine may not block. */ void vm_page_activate(vm_page_t m) { int s; mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); if (m->queue != PQ_ACTIVE) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; vm_pageq_remove(m); if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; vm_pageq_enqueue(PQ_ACTIVE, m); } } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } splx(s); } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq() and vm_page_cache(). This * routine is called when a page has been added to the cache or free * queues. * * This routine may not block. * This routine must be called at splvm() */ static __inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. 
And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = 0; wakeup(&cnt.v_free_count); } } /* * vm_page_free_toq: * * Returns the given page to the PQ_FREE list, * disassociating it with any VM object. * * Object and page must be locked prior to entry. * This routine may not block. */ void vm_page_free_toq(vm_page_t m) { int s; struct vpgqueues *pq; vm_object_t object = m->object; mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); cnt.v_tfree++; if (m->busy || ((m->queue - m->pc) == PQ_FREE)) { printf( "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n", (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0, m->hold_count); if ((m->queue - m->pc) == PQ_FREE) panic("vm_page_free: freeing free page"); else panic("vm_page_free: freeing busy page"); } /* * unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_pageq_remove_nowakeup(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { splx(s); return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) { if (m->wire_count > 1) { panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx", m->wire_count, (long)m->pindex); } panic("vm_page_free: freeing wired page\n"); } /* * If we've exhausted the object's resident pages we want to free * it up. */ if (object && (object->type == OBJT_VNODE) && ((object->flags & OBJ_DEAD) == 0) ) { struct vnode *vp = (struct vnode *)object->handle; if (vp) { VI_LOCK(vp); if (VSHOULDFREE(vp)) vfree(vp); VI_UNLOCK(vp); } } /* * Clear the UNMANAGED flag when freeing an unmanaged page. */ if (m->flags & PG_UNMANAGED) { m->flags &= ~PG_UNMANAGED; } if (m->hold_count != 0) { m->flags &= ~PG_ZERO; m->queue = PQ_HOLD; } else m->queue = PQ_FREE + m->pc; pq = &vm_page_queues[m->queue]; mtx_lock_spin(&vm_page_queue_free_mtx); pq->lcnt++; ++(*pq->cnt); /* * Put zero'd pages on the end ( where we look for zero'd pages * first ) and non-zerod pages at the head. */ if (m->flags & PG_ZERO) { TAILQ_INSERT_TAIL(&pq->pl, m, pageq); ++vm_page_zero_count; } else { TAILQ_INSERT_HEAD(&pq->pl, m, pageq); } mtx_unlock_spin(&vm_page_queue_free_mtx); vm_page_free_wakeup(); splx(s); } /* * vm_page_unmanage: * * Prevent PV management from being done on the page. The page is * removed from the paging queues as if it were wired, and as a * consequence of no longer being managed the pageout daemon will not * touch it (since there is no way to locate the pte mappings for the * page). madvise() calls that mess with the pmap will also no longer * operate on the page. * * Beyond that the page is still reasonably 'normal'. Freeing the page * will clear the flag. * * This routine is used by OBJT_PHYS objects - objects using unswappable * physical memory as backing store rather then swap-backed memory and * will eventually be extended to support 4MB unmanaged physical * mappings. */ void vm_page_unmanage(vm_page_t m) { int s; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_UNMANAGED) == 0) { if (m->wire_count == 0) vm_pageq_remove(m); } vm_page_flag_set(m, PG_UNMANAGED); splx(s); } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * The page queues must be locked. 
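 * A minimal usage sketch (m is any managed, resident page):
 *
 *	vm_page_lock_queues();
 *	vm_page_wire(m);
 *	...
 *	vm_page_unwire(m, 0);
 *	vm_page_unlock_queues();
 *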
* This routine may not block. */ void vm_page_wire(vm_page_t m) { int s; /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->wire_count == 0) { if ((m->flags & PG_UNMANAGED) == 0) vm_pageq_remove(m); atomic_add_int(&cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); splx(s); } /* * vm_page_unwire: * * Release one wiring of this page, potentially * enabling it to be paged again. * * Many pages placed on the inactive queue should actually go * into the cache, but it is difficult to figure out which. What * we do instead, if the inactive target is well met, is to put * clean pages at the head of the inactive queue instead of the tail. * This will cause them to be moved to the cache more quickly and * if not actively re-referenced, freed more quickly. If we just * stick these pages at the end of the inactive queue, heavy filesystem * meta-data accesses can cause an unnecessary paging load on memory bound * processes. This optimization causes one-time-use metadata to be * reused more quickly. * * BUT, if we are in a low-memory situation we have no choice but to * put clean pages on the cache queue. * * A number of routines use vm_page_unwire() to guarantee that the page * will go into either the inactive or active queues, and will NEVER * be placed in the cache - for example, just after dirtying a page. * dirty pages in the cache are not allowed. * * The page queues must be locked. * This routine may not block. */ void vm_page_unwire(vm_page_t m, int activate) { int s; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&cnt.v_wire_count, 1); if (m->flags & PG_UNMANAGED) { ; } else if (activate) vm_pageq_enqueue(PQ_ACTIVE, m); else { vm_page_flag_clear(m, PG_WINATCFLS); vm_pageq_enqueue(PQ_INACTIVE, m); } } } else { panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count); } splx(s); } /* * Move the specified page to the inactive queue. If the page has * any associated swap, the swap is deallocated. * * Normally athead is 0 resulting in LRU operation. athead is set * to 1 if we want this page to be 'as if it were placed in the cache', * except without unmapping it from the process address space. * * This routine may not block. */ static __inline void _vm_page_deactivate(vm_page_t m, int athead) { int s; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Ignore if already inactive. 
*/ if (m->queue == PQ_INACTIVE) return; s = splvm(); if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; vm_page_flag_clear(m, PG_WINATCFLS); vm_pageq_remove(m); if (athead) TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); else TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); m->queue = PQ_INACTIVE; vm_page_queues[PQ_INACTIVE].lcnt++; cnt.v_inactive_count++; } splx(s); } void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, 0); } /* * vm_page_try_to_cache: * * Returns 0 on failure, 1 on success */ int vm_page_try_to_cache(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->dirty || m->hold_count || m->busy || m->wire_count || (m->flags & (PG_BUSY|PG_UNMANAGED))) { return (0); } pmap_remove_all(m); if (m->dirty) return (0); vm_page_cache(m); return (1); } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->object != NULL) VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->dirty || m->hold_count || m->busy || m->wire_count || (m->flags & (PG_BUSY|PG_UNMANAGED))) { return (0); } pmap_remove_all(m); if (m->dirty) return (0); vm_page_busy(m); vm_page_free(m); return (1); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). * * This routine may not block. */ void vm_page_cache(vm_page_t m) { int s; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->hold_count || m->wire_count) { printf("vm_page_cache: attempting to cache busy page\n"); return; } if ((m->queue - m->pc) == PQ_CACHE) return; /* * Remove all pmaps and indicate that the page is not * writeable or mapped. */ pmap_remove_all(m); if (m->dirty != 0) { panic("vm_page_cache: caching a dirty page, pindex: %ld", (long)m->pindex); } s = splvm(); vm_pageq_remove_nowakeup(m); vm_pageq_enqueue(PQ_CACHE + m->pc, m); vm_page_free_wakeup(); splx(s); } /* * vm_page_dontneed * * Cache, deactivate, or do nothing as appropriate. This routine * is typically used by madvise() MADV_DONTNEED. * * Generally speaking we want to move the page into the cache so * it gets reused quickly. However, this can result in a silly syndrome * due to the page recycling too quickly. Small objects will not be * fully cached. On the otherhand, if we move the page to the inactive * queue we wind up with a problem whereby very large objects * unnecessarily blow away our inactive and cache queues. * * The solution is to move the pages based on a fixed weighting. We * either leave them alone, deactivate them, or move them to the cache, * where moving them to the cache has the highest weighting. * By forcing some pages into other queues we eventually force the * system to balance the queues, potentially recovering other unrelated * space from active. The idea is to not force this to happen too * often. */ void vm_page_dontneed(vm_page_t m) { static int dnweight; int dnw; int head; mtx_assert(&vm_page_queue_mtx, MA_OWNED); dnw = ++dnweight; /* * occassionally leave the page alone */ if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE || m->queue - m->pc == PQ_CACHE ) { if (m->act_count >= ACT_INIT) --m->act_count; return; } if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty || (dnw & 0x0070) == 0) { /* * Deactivate the page 3 times out of 32. 
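 * (Arithmetic behind the 1/3/28 split: dnw increments once per call, so
 * (dnw & 0x01F0) == 0 holds for 16 of every 512 calls, i.e. 1 in 32
 * (the "leave alone" case above), while (dnw & 0x0070) == 0 holds for
 * 16 of every 128 calls, i.e. 4 in 32.  The leave-alone cases are a
 * subset of the latter, so, ignoring dirty pages, 3 in 32 calls reach
 * this branch and the remaining 28 in 32 fall through to the
 * cache-weighted case below.)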
*/ head = 0; } else { /* * Cache the page 28 times out of every 32. Note that * the page is deactivated instead of cached, but placed * at the head of the queue instead of the tail. */ head = 1; } _vm_page_deactivate(m, head); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, allocate it. * * This routine may block. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { vm_page_lock_queues(); if (m->busy || (m->flags & PG_BUSY)) { vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); VM_OBJECT_UNLOCK(object); msleep(m, &vm_page_queue_mtx, PDROP | PVM, "pgrbwt", 0); VM_OBJECT_LOCK(object); if ((allocflags & VM_ALLOC_RETRY) == 0) return (NULL); goto retrylookup; } else { if (allocflags & VM_ALLOC_WIRED) vm_page_wire(m); vm_page_busy(m); vm_page_unlock_queues(); return m; } } m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY); if (m == NULL) { VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); if ((allocflags & VM_ALLOC_RETRY) == 0) return NULL; goto retrylookup; } return m; } /* * Mapping function for valid bits or for dirty bits in * a page. May not block. * * Inputs are required to range within a page. */ __inline int vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return ((2 << last_bit) - (1 << first_bit)); } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * This routine may not block. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { int pagebits; int frag; int endoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = base & ~(DEV_BSIZE - 1)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the PG_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. 
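 *
 * As a worked example of vm_page_bits(), assuming DEV_BSIZE is 512
 * (DEV_BSHIFT 9): base = 512, size = 1024 gives first_bit = 1 and
 * last_bit = (512 + 1024 - 1) >> 9 = 2, so the mask is
 * (2 << 2) - (1 << 1) = 0x06, i.e. chunks 1 and 2 of the page.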
*/ pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif m->dirty &= ~pagebits; if (base == 0 && size == PAGE_SIZE) { pmap_clear_modify(m); vm_page_flag_clear(m, PG_NOSYNC); } } void vm_page_clear_dirty(vm_page_t m, int base, int size) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->dirty &= ~vm_page_bits(base, size); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. * * May not block. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { int bits; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); bits = vm_page_bits(base, size); mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->valid &= ~bits; m->dirty &= ~bits; m->object->generation++; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); /* * Scan the valid bits looking for invalid sections that * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zerod by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & (1 << i)) ) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. * * May not block. */ int vm_page_is_valid(vm_page_t m, int base, int size) { int bits = vm_page_bits(base, size); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->valid && ((m->valid & bits) == bits)) return 1; else return 0; } /* * update dirty bits from pmap/mmu. May not block. */ void vm_page_test_dirty(vm_page_t m) { if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) { vm_page_dirty(m); } } int so_zerocp_fullpage = 0; void vm_page_cowfault(vm_page_t m) { vm_page_t mnew; vm_object_t object; vm_pindex_t pindex; object = m->object; pindex = m->pindex; vm_page_busy(m); retry_alloc: vm_page_remove(m); /* * An interrupt allocation is requested because the page * queues lock is held. */ mnew = vm_page_alloc(object, pindex, VM_ALLOC_INTERRUPT); if (mnew == NULL) { vm_page_insert(m, object, pindex); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); vm_page_lock_queues(); goto retry_alloc; } if (m->cow == 0) { /* * check to see if we raced with an xmit complete when * waiting to allocate a page. 
If so, put things back * the way they were */ vm_page_busy(mnew); vm_page_free(mnew); vm_page_insert(m, object, pindex); } else { /* clear COW & copy page */ if (!so_zerocp_fullpage) pmap_copy_page(m, mnew); mnew->valid = VM_PAGE_BITS_ALL; vm_page_dirty(mnew); vm_page_flag_clear(mnew, PG_BUSY); } } void vm_page_cowclear(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->cow) { m->cow--; /* * let vm_fault add back write permission lazily */ } /* * sf_buf_free() will free the page, so we needn't do it here */ } void vm_page_cowsetup(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->cow++; pmap_page_protect(m, VM_PROT_READ); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int i; db_printf("PQ_FREE:"); for (i = 0; i < PQ_L2_SIZE; i++) { db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt); } db_printf("\n"); db_printf("PQ_CACHE:"); for (i = 0; i < PQ_L2_SIZE; i++) { db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt); } db_printf("\n"); db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", vm_page_queues[PQ_ACTIVE].lcnt, vm_page_queues[PQ_INACTIVE].lcnt); } #endif /* DDB */ Index: head/sys/vm/vm_page.h =================================================================== --- head/sys/vm/vm_page.h (revision 127960) +++ head/sys/vm/vm_page.h (revision 127961) @@ -1,400 +1,396 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #if !defined(KLD_MODULE) #include "opt_vmpage.h" #endif #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several lists: * * A hash table bucket used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * Fields in this structure are locked either by the lock on the * object that the page belongs to (O) or by the lock on the page * queues (P). * * The 'valid' and 'dirty' fields are distinct. A page may have dirty * bits set without having associated valid bits set. This is used by * NFS to implement piecemeal writes. */ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ struct vm_page *left; /* splay tree link (O) */ struct vm_page *right; /* splay tree link (O) */ vm_object_t object; /* which object am I in (O,P)*/ vm_pindex_t pindex; /* offset into object (O,P) */ vm_paddr_t phys_addr; /* physical address of page */ struct md_page md; /* machine dependant stuff */ u_short queue; /* page queue index */ u_short flags, /* see below */ pc; /* page color */ u_short wire_count; /* wired down maps refs (P) */ short hold_count; /* page hold count */ u_char act_count; /* page usage count */ u_char busy; /* page busy count */ /* NOTE that these must support one bit per DEV_BSIZE in a page!!! 
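 * (With the usual DEV_BSIZE of 512 that is 8 chunks for a 4K page, 16
 * for 8K, 32 for 16K and 64 for 32K, which is why the field widths
 * below track PAGE_SIZE and why VM_PAGE_BITS_ALL grows to match.)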
*/ /* so, on normal X86 kernels, they must be at least 8 bits wide */ #if PAGE_SIZE == 4096 u_char valid; /* map of valid DEV_BSIZE chunks (O) */ u_char dirty; /* map of dirty DEV_BSIZE chunks */ #elif PAGE_SIZE == 8192 u_short valid; /* map of valid DEV_BSIZE chunks (O) */ u_short dirty; /* map of dirty DEV_BSIZE chunks */ #elif PAGE_SIZE == 16384 u_int valid; /* map of valid DEV_BSIZE chunks (O) */ u_int dirty; /* map of dirty DEV_BSIZE chunks */ #elif PAGE_SIZE == 32768 u_long valid; /* map of valid DEV_BSIZE chunks (O) */ u_long dirty; /* map of dirty DEV_BSIZE chunks */ #endif u_int cow; /* page cow mapping count */ }; /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * note: currently use SWAPBLK_NONE as an absolute value rather then * a flag bit. */ #define SWAPBLK_MASK ((daddr_t)((u_daddr_t)-1 >> 1)) /* mask */ #define SWAPBLK_NONE ((daddr_t)((u_daddr_t)SWAPBLK_MASK + 1))/* flag */ #if !defined(KLD_MODULE) /* * Page coloring parameters */ /* Each of PQ_FREE, and PQ_CACHE have PQ_HASH_SIZE entries */ /* Backward compatibility for existing PQ_*CACHE config options. */ #if !defined(PQ_CACHESIZE) #if defined(PQ_HUGECACHE) #define PQ_CACHESIZE 1024 #elif defined(PQ_LARGECACHE) #define PQ_CACHESIZE 512 #elif defined(PQ_MEDIUMCACHE) #define PQ_CACHESIZE 256 #elif defined(PQ_NORMALCACHE) #define PQ_CACHESIZE 64 #elif defined(PQ_NOOPT) #define PQ_CACHESIZE 0 #else #define PQ_CACHESIZE 128 #endif #endif /* !defined(PQ_CACHESIZE) */ #if PQ_CACHESIZE >= 1024 #define PQ_PRIME1 31 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 23 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 256 /* A number of colors opt for 1M cache */ #elif PQ_CACHESIZE >= 512 #define PQ_PRIME1 31 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 23 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 128 /* A number of colors opt for 512K cache */ #elif PQ_CACHESIZE >= 256 #define PQ_PRIME1 13 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 7 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 64 /* A number of colors opt for 256K cache */ #elif PQ_CACHESIZE >= 128 #define PQ_PRIME1 9 /* Produces a good PQ_L2_SIZE/3 + PQ_PRIME1 */ #define PQ_PRIME2 5 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 32 /* A number of colors opt for 128k cache */ #elif PQ_CACHESIZE >= 64 #define PQ_PRIME1 5 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 3 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 16 /* A reasonable number of colors (opt for 64K cache) */ #else #define PQ_PRIME1 1 /* Disable page coloring. */ #define PQ_PRIME2 1 #define PQ_L2_SIZE 1 #endif #define PQ_L2_MASK (PQ_L2_SIZE - 1) #define PQ_NONE 0 #define PQ_FREE 1 #define PQ_INACTIVE (1 + 1*PQ_L2_SIZE) #define PQ_ACTIVE (2 + 1*PQ_L2_SIZE) #define PQ_CACHE (3 + 1*PQ_L2_SIZE) #define PQ_HOLD (3 + 2*PQ_L2_SIZE) #define PQ_COUNT (4 + 2*PQ_L2_SIZE) struct vpgqueues { struct pglist pl; int *cnt; int lcnt; }; extern struct vpgqueues vm_page_queues[PQ_COUNT]; extern struct mtx vm_page_queue_free_mtx; #endif /* !defined(KLD_MODULE) */ /* * These are the flags defined for vm_page. * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. * * Note: PG_UNMANAGED (used by OBJT_PHYS) indicates that the page is * not under PV management but otherwise should be treated as a * normal page. 
Pages not under PV management cannot be paged out * via the object/vm_page_t because there is no knowledge of their * pte mappings, nor can they be removed from their objects via * the object, and such pages are also not on any PQ queue. */ #define PG_BUSY 0x0001 /* page is in transit (O) */ #define PG_WANTED 0x0002 /* someone is waiting for page (O) */ #define PG_WINATCFLS 0x0004 /* flush dirty page on inactive q */ #define PG_FICTITIOUS 0x0008 /* physical page doesn't exist (O) */ #define PG_WRITEABLE 0x0010 /* page is mapped writeable */ #define PG_ZERO 0x0040 /* page is zeroed */ #define PG_REFERENCED 0x0080 /* page has been referenced */ #define PG_CLEANCHK 0x0100 /* page will be checked for cleaning */ #define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */ #define PG_NOSYNC 0x0400 /* do not collect for syncer */ #define PG_UNMANAGED 0x0800 /* No PV management for page */ #define PG_MARKER 0x1000 /* special queue marker page */ #define PG_SLAB 0x2000 /* object pointer is actually a slab */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 64 #ifdef _KERNEL /* * Each pageable resident page falls into one of four lists: * * free * Available for allocation now. * * The following are all LRU sorted: * * cache * Almost available for allocation. Still in an * object, but clean and immediately freeable at * non-interrupt times. * * inactive * Low activity, candidates for reclamation. * This is the list of pages that should be * paged out next. * * active * Pages that are "active" i.e. they have been * recently referenced. * * zero * Pages that are really free and have been pre-zeroed * */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern int vm_page_array_size; /* number of vm_page_t's */ extern long first_page; /* first physical page number */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) #define PHYS_TO_VM_PAGE(pa) \ (&vm_page_array[atop(pa) - first_page ]) extern struct mtx vm_page_queue_mtx; #define vm_page_lock_queues() mtx_lock(&vm_page_queue_mtx) #define vm_page_unlock_queues() mtx_unlock(&vm_page_queue_mtx) #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xffu #elif PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffffu #elif PAGE_SIZE == 16384 #define VM_PAGE_BITS_ALL 0xffffffffu #elif PAGE_SIZE == 32768 #define VM_PAGE_BITS_ALL 0xfffffffffffffffflu #endif /* page allocation classes: */ #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_CLASS_MASK 3 /* page allocation flags: */ #define VM_ALLOC_WIRED 0x0020 /* non pageable */ #define VM_ALLOC_ZERO 0x0040 /* Try to obtain a zeroed page */ #define VM_ALLOC_RETRY 0x0080 /* vm_page_grab() only */ #define VM_ALLOC_NOOBJ 0x0100 /* No associated object */ void vm_page_flag_set(vm_page_t m, unsigned short bits); void vm_page_flag_clear(vm_page_t m, unsigned short bits); void vm_page_busy(vm_page_t m); void vm_page_flash(vm_page_t m); void vm_page_io_start(vm_page_t m); void vm_page_io_finish(vm_page_t m); void vm_page_hold(vm_page_t mem); void vm_page_unhold(vm_page_t mem); void vm_page_free(vm_page_t m); void vm_page_free_zero(vm_page_t m); int vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg); void vm_page_dirty(vm_page_t m); void vm_page_wakeup(vm_page_t m); void vm_pageq_init(void); vm_page_t vm_pageq_add_new_page(vm_paddr_t pa); void vm_pageq_enqueue(int queue, vm_page_t m); void vm_pageq_remove_nowakeup(vm_page_t m); void vm_pageq_remove(vm_page_t m); 
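/*
 * Illustrative only, not a declaration: a typical allocation combines one
 * class with optional flags and is made with the object lock held, e.g.
 *
 *	VM_OBJECT_LOCK(object);
 *	m = vm_page_grab(object, pindex,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
 *	...
 *	VM_OBJECT_UNLOCK(object);
 *
 * vm_page_grab() may sleep; vm_page_alloc() never blocks and returns NULL
 * when the request class cannot be satisfied.
 */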
vm_page_t vm_pageq_find(int basequeue, int index, boolean_t prefer_zero); void vm_pageq_requeue(vm_page_t m); void vm_page_activate (vm_page_t); vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int); vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int); void vm_page_cache (register vm_page_t); int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_dontneed (register vm_page_t); void vm_page_deactivate (vm_page_t); void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); void vm_page_remove (vm_page_t); void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t); vm_page_t vm_page_select_cache(int); vm_page_t vm_page_splay(vm_pindex_t, vm_page_t); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_unmanage (vm_page_t); void vm_page_unwire (vm_page_t, int); void vm_page_wire (vm_page_t); void vm_page_set_validclean (vm_page_t, int, int); void vm_page_clear_dirty (vm_page_t, int, int); void vm_page_set_invalid (vm_page_t, int, int); int vm_page_is_valid (vm_page_t, int, int); void vm_page_test_dirty (vm_page_t); int vm_page_bits (int, int); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); void vm_page_zero_idle_wakeup(void); void vm_page_cowfault (vm_page_t); void vm_page_cowsetup (vm_page_t); void vm_page_cowclear (vm_page_t); /* * vm_page_undirty: * * Set page to not be dirty. Note: does not clear pmap modify bits */ static __inline void vm_page_undirty(vm_page_t m) { m->dirty = 0; } #endif /* _KERNEL */ #endif /* !_VM_PAGE_ */ Index: head/sys/vm/vm_pageout.h =================================================================== --- head/sys/vm/vm_pageout.h (revision 127960) +++ head/sys/vm/vm_pageout.h (revision 127961) @@ -1,113 +1,109 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #ifndef _VM_VM_PAGEOUT_H_ #define _VM_VM_PAGEOUT_H_ /* * Header file for pageout daemon. */ /* * Exported data structures. */ extern int vm_page_max_wired; extern int vm_pages_needed; /* should be some "event" structure */ extern int vm_pageout_pages_needed; extern int vm_pageout_deficit; extern int vm_pageout_page_count; /* * Swap out requests */ #define VM_SWAP_NORMAL 1 #define VM_SWAP_IDLE 2 /* * Exported routines. */ /* * Signal pageout-daemon and wait for it. */ extern void pagedaemon_wakeup(void); #define VM_WAIT vm_wait() #define VM_WAITPFAULT vm_waitpfault() extern void vm_wait(void); extern void vm_waitpfault(void); /* XXX This is probably misplaced. */ #ifndef NO_SWAPPING struct swdevt; void vm_proc_swapin_all(struct swdevt *); #endif /* !NO_SWAPPING */ #ifdef _KERNEL int vm_pageout_flush(vm_page_t *, int, int); #endif #endif /* _VM_VM_PAGEOUT_H_ */ Index: head/sys/vm/vm_pager.c =================================================================== --- head/sys/vm/vm_pager.c (revision 127960) +++ head/sys/vm/vm_pager.c (revision 127961) @@ -1,434 +1,430 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. 
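 *
 * Each vm_object type is an index into pagertab[] (defined below), so a
 * pagein on, say, an OBJT_VNODE object is dispatched in outline as
 *
 *	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
 *
 * which resolves to vnodepagerops.pgo_getpages.  (Sketch only; callers
 * use the inline wrappers in vm_pager.h.)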
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_VMPGDATA, "VM pgdata", "XXX: VM pager private data"); int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int); static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t); static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t dead_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dead_pager_dealloc(vm_object_t); static int dead_pager_getpages(obj, ma, count, req) vm_object_t obj; vm_page_t *ma; int count; int req; { return VM_PAGER_FAIL; } static vm_object_t dead_pager_alloc(handle, size, prot, off) void *handle; vm_ooffset_t size; vm_prot_t prot; vm_ooffset_t off; { return NULL; } static void dead_pager_putpages(object, m, count, flags, rtvals) vm_object_t object; vm_page_t *m; int count; int flags; int *rtvals; { int i; for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_AGAIN; } } static int dead_pager_haspage(object, pindex, prev, next) vm_object_t object; vm_pindex_t pindex; int *prev; int *next; { if (prev) *prev = 0; if (next) *next = 0; return FALSE; } static void dead_pager_dealloc(object) vm_object_t object; { return; } static struct pagerops deadpagerops = { .pgo_alloc = dead_pager_alloc, .pgo_dealloc = dead_pager_dealloc, .pgo_getpages = dead_pager_getpages, .pgo_putpages = dead_pager_putpages, .pgo_haspage = dead_pager_haspage, }; struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &physpagerops, /* OBJT_PHYS */ &deadpagerops /* OBJT_DEAD */ }; int npagers = sizeof(pagertab) / sizeof(pagertab[0]); /* * Kernel address space for mapping pages. * Used by pagers where KVAs are needed for IO. * * XXX needs to be large enough to support the number of pending async * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ #define PAGER_MAP_SIZE (8 * 1024 * 1024) int pager_map_size = PAGER_MAP_SIZE; vm_map_t pager_map; static int bswneeded; static vm_offset_t swapbkva; /* swap buffers kva */ struct mtx pbuf_mtx; static TAILQ_HEAD(swqueue, buf) bswlist; void vm_pager_init() { struct pagerops **pgops; TAILQ_INIT(&bswlist); /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops && ((*pgops)->pgo_init != NULL)) (*(*pgops)->pgo_init) (); } void vm_pager_bufferinit() { struct buf *bp; int i; mtx_init(&pbuf_mtx, "pbuf mutex", NULL, MTX_DEF); bp = swbuf; /* * Now set up swap and physical I/O buffer headers. */ for (i = 0; i < nswbuf; i++, bp++) { TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); BUF_LOCKINIT(bp); LIST_INIT(&bp->b_dep); bp->b_rcred = bp->b_wcred = NOCRED; bp->b_xflags = 0; } cluster_pbuf_freecnt = nswbuf / 2; swapbkva = kmem_alloc_nofault(pager_map, nswbuf * MAXPHYS); if (!swapbkva) panic("Not enough pager_map VM space for physical buffers"); } /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). 
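 *
 * For instance, a device mapping might create its object as (sketch only;
 * the handle for OBJT_DEVICE identifies the device being mapped):
 *
 *	object = vm_pager_allocate(OBJT_DEVICE, handle, size,
 *	    VM_PROT_READ | VM_PROT_WRITE, foff);
 *
 * A NULL return means the pager declined the request, e.g. because the
 * offset or protection failed its validation.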
*/ vm_object_t vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off) { vm_object_t ret; struct pagerops *ops; ops = pagertab[type]; if (ops) ret = (*ops->pgo_alloc) (handle, size, prot, off); else ret = NULL; return (ret); } /* * The object must be locked. */ void vm_pager_deallocate(object) vm_object_t object; { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); (*pagertab[object->type]->pgo_dealloc) (object); } /* * vm_pager_get_pages() - inline, see vm/vm_pager.h * vm_pager_put_pages() - inline, see vm/vm_pager.h * vm_pager_has_page() - inline, see vm/vm_pager.h */ vm_object_t vm_pager_object_lookup(pg_list, handle) struct pagerlst *pg_list; void *handle; { vm_object_t object; TAILQ_FOREACH(object, pg_list, pager_object_list) if (object->handle == handle) return (object); return (NULL); } /* * initialize a physical buffer */ /* * XXX This probably belongs in vfs_bio.c */ static void initpbuf(struct buf *bp) { bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */ bp->b_saveaddr = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva; bp->b_data = bp->b_saveaddr; bp->b_kvabase = bp->b_saveaddr; bp->b_kvasize = MAXPHYS; bp->b_xflags = 0; bp->b_flags = 0; bp->b_ioflags = 0; bp->b_iodone = NULL; bp->b_error = 0; bp->b_magic = B_MAGIC_BIO; bp->b_op = &buf_ops_bio; BUF_LOCK(bp, LK_EXCLUSIVE, NULL); } /* * allocate a physical buffer * * There are a limited number (nswbuf) of physical buffers. We need * to make sure that no single subsystem is able to hog all of them, * so each subsystem implements a counter which is typically initialized * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and * increments it on release, and blocks if the counter hits zero. A * subsystem may initialize the counter to -1 to disable the feature, * but it must still be sure to match up all uses of getpbuf() with * relpbuf() using the same variable. * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX */ struct buf * getpbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); mtx_lock(&pbuf_mtx); for (;;) { if (pfreecnt) { while (*pfreecnt == 0) { msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0); } } /* get a bp from the swap buffer header pool */ if ((bp = TAILQ_FIRST(&bswlist)) != NULL) break; bswneeded = 1; msleep(&bswneeded, &pbuf_mtx, PVM, "wswbuf1", 0); /* loop in case someone else grabbed one */ } TAILQ_REMOVE(&bswlist, bp, b_freelist); if (pfreecnt) --*pfreecnt; mtx_unlock(&pbuf_mtx); splx(s); initpbuf(bp); return bp; } /* * allocate a physical buffer, if one is available. * * Note that there is no NULL hack here - all subsystems using this * call understand how to use pfreecnt. */ struct buf * trypbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); mtx_lock(&pbuf_mtx); if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { mtx_unlock(&pbuf_mtx); splx(s); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); --*pfreecnt; mtx_unlock(&pbuf_mtx); splx(s); initpbuf(bp); return bp; } /* * release a physical buffer * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. 
XXX */ void relpbuf(bp, pfreecnt) struct buf *bp; int *pfreecnt; { int s; s = splvm(); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (bp->b_vp) pbrelvp(bp); BUF_UNLOCK(bp); mtx_lock(&pbuf_mtx); TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); if (bswneeded) { bswneeded = 0; wakeup(&bswneeded); } if (pfreecnt) { if (++*pfreecnt == 1) wakeup(pfreecnt); } mtx_unlock(&pbuf_mtx); splx(s); } Index: head/sys/vm/vm_pager.h =================================================================== --- head/sys/vm/vm_pager.h (revision 127960) +++ head/sys/vm/vm_pager.h (revision 127961) @@ -1,195 +1,191 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_pager.h 8.4 (Berkeley) 1/12/94 * $FreeBSD$ */ /* * Pager routine interface definition. */ #ifndef _VM_PAGER_ #define _VM_PAGER_ #include TAILQ_HEAD(pagerlst, vm_object); struct bio; struct pagerops { void (*pgo_init)(void); /* Initialize pager. */ vm_object_t (*pgo_alloc)(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t); /* Allocate pager. */ void (*pgo_dealloc)(vm_object_t); /* Disassociate. */ int (*pgo_getpages)(vm_object_t, vm_page_t *, int, int); /* Get (read) page. */ void (*pgo_putpages)(vm_object_t, vm_page_t *, int, int, int *); /* Put (write) page. */ boolean_t (*pgo_haspage)(vm_object_t, vm_pindex_t, int *, int *); /* Does pager have page? 
*/ void (*pgo_pageunswapped)(vm_page_t); }; extern struct pagerops defaultpagerops; extern struct pagerops swappagerops; extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; extern struct pagerops physpagerops; /* * get/put return values * OK operation was successful * BAD specified data was out of the accepted range * FAIL specified data was in range, but doesn't exist * PEND operations was initiated but not completed * ERROR error while accessing data that is in range and exists * AGAIN temporary resource shortage prevented operation from happening */ #define VM_PAGER_OK 0 #define VM_PAGER_BAD 1 #define VM_PAGER_FAIL 2 #define VM_PAGER_PEND 3 #define VM_PAGER_ERROR 4 #define VM_PAGER_AGAIN 5 #define VM_PAGER_PUT_SYNC 0x0001 #define VM_PAGER_PUT_INVAL 0x0002 #define VM_PAGER_IGNORE_CLEANCHK 0x0004 #define VM_PAGER_CLUSTER_OK 0x0008 #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VMPGDATA); #endif extern vm_map_t pager_map; extern int pager_map_size; extern struct pagerops *pagertab[]; extern struct mtx pbuf_mtx; vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t); void vm_pager_bufferinit(void); void vm_pager_deallocate(vm_object_t); static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int); static __inline boolean_t vm_pager_has_page(vm_object_t, vm_pindex_t, int *, int *); void vm_pager_init(void); vm_object_t vm_pager_object_lookup(struct pagerlst *, void *); /* * vm_page_get_pages: * * Retrieve pages from the VM system in order to map them into an object * ( or into VM space somewhere ). If the pagein was successful, we * must fully validate it. */ static __inline int vm_pager_get_pages( vm_object_t object, vm_page_t *m, int count, int reqpage ) { int r; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage); if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) { vm_page_zero_invalid(m[reqpage], TRUE); } return (r); } static __inline void vm_pager_put_pages( vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals ) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); (*pagertab[object->type]->pgo_putpages) (object, m, count, flags, rtvals); } /* * vm_pager_haspage * * Check to see if an object's pager has the requested page. The * object's pager will also set before and after to give the caller * some idea of the number of pages before and after the requested * page can be I/O'd efficiently. * * This routine does not have to be called at any particular spl. */ static __inline boolean_t vm_pager_has_page( vm_object_t object, vm_pindex_t offset, int *before, int *after ) { boolean_t ret; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); ret = (*pagertab[object->type]->pgo_haspage) (object, offset, before, after); return (ret); } /* * vm_pager_page_unswapped * * called at splvm() to destroy swap associated with the page. * * This function may not block. * * XXX: A much better name would be "vm_pager_page_dirtied()" * XXX: It is not obvious if this could be profitably used by any * XXX: pagers besides the swap_pager or if it should even be a * XXX: generic pager_op in the first place. 
*/ static __inline void vm_pager_page_unswapped(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (pagertab[m->object->type]->pgo_pageunswapped) (*pagertab[m->object->type]->pgo_pageunswapped)(m); } #endif /* _KERNEL */ #endif /* _VM_PAGER_ */ Index: head/sys/vm/vm_param.h =================================================================== --- head/sys/vm/vm_param.h (revision 127960) +++ head/sys/vm/vm_param.h (revision 127961) @@ -1,145 +1,141 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_param.h 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Machine independent virtual memory parameters. 
*/ #ifndef _VM_PARAM_ #define _VM_PARAM_ #include /* * CTL_VM identifiers */ #define VM_TOTAL 1 /* struct vmtotal */ #define VM_METER VM_TOTAL /* deprecated, use VM_TOTAL */ #define VM_LOADAVG 2 /* struct loadavg */ #define VM_V_FREE_MIN 3 /* cnt.v_free_min */ #define VM_V_FREE_TARGET 4 /* cnt.v_free_target */ #define VM_V_FREE_RESERVED 5 /* cnt.v_free_reserved */ #define VM_V_INACTIVE_TARGET 6 /* cnt.v_inactive_target */ #define VM_V_CACHE_MIN 7 /* cnt.v_cache_min */ #define VM_V_CACHE_MAX 8 /* cnt.v_cache_max */ #define VM_V_PAGEOUT_FREE_MIN 9 /* cnt.v_pageout_free_min */ #define VM_PAGEOUT_ALGORITHM 10 /* pageout algorithm */ #define VM_SWAPPING_ENABLED 11 /* swapping enabled */ #define VM_MAXID 12 /* number of valid vm ids */ #define CTL_VM_NAMES { \ { 0, 0 }, \ { "vmtotal", CTLTYPE_STRUCT }, \ { "loadavg", CTLTYPE_STRUCT }, \ { "v_free_min", CTLTYPE_INT }, \ { "v_free_target", CTLTYPE_INT }, \ { "v_free_reserved", CTLTYPE_INT }, \ { "v_inactive_target", CTLTYPE_INT }, \ { "v_cache_min", CTLTYPE_INT }, \ { "v_cache_max", CTLTYPE_INT }, \ { "v_pageout_free_min", CTLTYPE_INT }, \ { "pageout_algorithm", CTLTYPE_INT }, \ { "swapping_enabled", CTLTYPE_INT }, \ } /* * Structure for swap device statistics */ #define XSWDEV_VERSION 1 struct xswdev { u_int xsw_version; udev_t xsw_dev; int xsw_flags; int xsw_nblks; int xsw_used; }; /* * Return values from the VM routines. */ #define KERN_SUCCESS 0 #define KERN_INVALID_ADDRESS 1 #define KERN_PROTECTION_FAILURE 2 #define KERN_NO_SPACE 3 #define KERN_INVALID_ARGUMENT 4 #define KERN_FAILURE 5 #define KERN_RESOURCE_SHORTAGE 6 #define KERN_NOT_RECEIVER 7 #define KERN_NO_ACCESS 8 #ifndef ASSEMBLER #ifdef _KERNEL #define num_pages(x) \ ((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) >> PAGE_SHIFT)) extern u_quad_t maxtsiz; extern u_quad_t dfldsiz; extern u_quad_t maxdsiz; extern u_quad_t dflssiz; extern u_quad_t maxssiz; extern u_quad_t sgrowsiz; #endif /* _KERNEL */ #endif /* ASSEMBLER */ #endif /* _VM_PARAM_ */ Index: head/sys/vm/vm_unix.c =================================================================== --- head/sys/vm/vm_unix.c (revision 127960) +++ head/sys/vm/vm_unix.c (revision 127961) @@ -1,180 +1,176 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$ * * @(#)vm_unix.c 8.1 (Berkeley) 6/11/93 */ /* * Traditional sbrk/grow interface to VM */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct obreak_args { char *nsize; }; #endif /* * MPSAFE */ /* ARGSUSED */ int obreak(td, uap) struct thread *td; struct obreak_args *uap; { struct vmspace *vm = td->td_proc->p_vmspace; vm_offset_t new, old, base; rlim_t datalim, vmemlim; int rv; int error = 0; boolean_t do_map_wirefuture; PROC_LOCK(td->td_proc); datalim = lim_cur(td->td_proc, RLIMIT_DATA); vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM); PROC_UNLOCK(td->td_proc); do_map_wirefuture = FALSE; new = round_page((vm_offset_t)uap->nsize); vm_map_lock(&vm->vm_map); base = round_page((vm_offset_t) vm->vm_daddr); old = base + ctob(vm->vm_dsize); if (new > base) { /* * Check the resource limit, but allow a process to reduce * its usage, even if it remains over the limit. */ if (new - base > datalim && new > old) { error = ENOMEM; goto done; } if (new > vm_map_max(&vm->vm_map)) { error = ENOMEM; goto done; } } else if (new < base) { /* * This is simply an invalid value. If someone wants to * do fancy address space manipulations, mmap and munmap * can do most of what the user would want. */ error = EINVAL; goto done; } if (new > old) { if (vm->vm_map.size + (new - old) > vmemlim) { error = ENOMEM; goto done; } rv = vm_map_insert(&vm->vm_map, NULL, 0, old, new, VM_PROT_ALL, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { error = ENOMEM; goto done; } vm->vm_dsize += btoc(new - old); /* * Handle the MAP_WIREFUTURE case for legacy applications, * by marking the newly mapped range of pages as wired. * We are not required to perform a corresponding * vm_map_unwire() before vm_map_delete() below, as * it will forcibly unwire the pages in the range. * * XXX If the pages cannot be wired, no error is returned. 
*/ if ((vm->vm_map.flags & MAP_WIREFUTURE) == MAP_WIREFUTURE) { if (bootverbose) printf("obreak: MAP_WIREFUTURE set\n"); do_map_wirefuture = TRUE; } } else if (new < old) { rv = vm_map_delete(&vm->vm_map, new, old); if (rv != KERN_SUCCESS) { error = ENOMEM; goto done; } vm->vm_dsize -= btoc(old - new); } done: vm_map_unlock(&vm->vm_map); if (do_map_wirefuture) (void) vm_map_wire(&vm->vm_map, old, new, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ovadvise_args { int anom; }; #endif /* * MPSAFE */ /* ARGSUSED */ int ovadvise(td, uap) struct thread *td; struct ovadvise_args *uap; { /* START_GIANT_OPTIONAL */ /* END_GIANT_OPTIONAL */ return (EINVAL); } Index: head/sys/vm/vnode_pager.h =================================================================== --- head/sys/vm/vnode_pager.h (revision 127960) +++ head/sys/vm/vnode_pager.h (revision 127961) @@ -1,59 +1,55 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode_pager.h 8.1 (Berkeley) 6/11/93 * $FreeBSD$ */ #ifndef _VNODE_PAGER_ #define _VNODE_PAGER_ 1 #ifdef _KERNEL vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t); struct vnode *vnode_pager_lock(vm_object_t); /* * XXX Generic routines; currently called by badly written FS code; these * XXX should go away soon. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage); int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m, int count, boolean_t sync, int *rtvals); #endif /* _KERNEL */ #endif /* _VNODE_PAGER_ */
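The two generic routines declared above are normally all a filesystem needs in order to participate in the vnode pager: it can simply forward its VOP_GETPAGES and VOP_PUTPAGES methods to them. The sketch below is illustrative only; the samplefs_* names are hypothetical, and the argument-structure fields (a_vp, a_m, a_count, a_reqpage, a_sync, a_rtvals) and the header list are assumed to follow the usual vop_getpages_args/vop_putpages_args conventions rather than being taken from this change.

#include <sys/param.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vnode_pager.h>

/*
 * Hypothetical VOP_GETPAGES method: hand the request to the generic
 * vnode pager, which issues the read and validates the pages on success.
 */
static int
samplefs_getpages(ap)
	struct vop_getpages_args *ap;
{

	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage));
}

/*
 * Hypothetical VOP_PUTPAGES method: hand the write-back of the supplied
 * pages to the generic vnode pager, which fills in the per-page rtvals.
 */
static int
samplefs_putpages(ap)
	struct vop_putpages_args *ap;
{

	return (vnode_pager_generic_putpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_sync, ap->a_rtvals));
}

A filesystem with its own clustering or block-mapping strategy would supply real method bodies here instead of delegating to the generic code.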