Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 17333) +++ head/sys/amd64/amd64/pmap.c (revision 17334) @@ -1,2549 +1,2669 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.111 1996/07/28 20:31:27 dyson Exp $ + * $Id: pmap.c,v 1.113 1996/07/29 14:22:46 dyson Exp $ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PMAP_KEEP_PDIRS #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif -#if !defined(SMALL_KERNEL) -#define PMAP_INLINE __inline -#else -#define PMAP_INLINE -#endif - static void init_pv_entries __P((int)); /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ static vm_offset_t vm_first_phys; static int nkpt; static vm_page_t nkpg; vm_offset_t kernel_vm_end; extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; #define PV_FREELIST_MIN ((PAGE_SIZE / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ static int pv_freelistcnt; -TAILQ_HEAD (,pv_entry) pv_freelist; +static pv_entry_t pv_freelist; static vm_offset_t pvva; static int npvvapg; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1; static pt_entry_t *CMAP2, *ptmmap; +static pv_entry_t *pv_table; caddr_t CADDR1, ptvmmap; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp; pt_entry_t *PMAP1; unsigned *PADDR1; static void free_pv_entry __P((pv_entry_t pv)); -static unsigned * get_ptbase __P((pmap_t pmap)); +static __inline unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_alloc_pv_entry __P((void)); +static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static int pmap_is_managed __P((vm_offset_t pa)); -static int pmap_remove_all __P((vm_offset_t pa)); -static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, - vm_offset_t pa, vm_page_t mpte)); +static void pmap_remove_all __P((vm_offset_t pa)); +static void pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, + vm_offset_t pa)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); -static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, +static __inline int pmap_remove_entry __P((struct pmap *pmap, pv_entry_t *pv, vm_offset_t va)); -static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, +static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); +static __inline void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); -static vm_page_t 
pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); +static __inline vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); -static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); +static __inline int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, int ptepindex)); -unsigned * __pure pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); -int pmap_tcbit __P((vm_offset_t pa, int bit)); -static vm_page_t pmap_page_alloc __P((vm_object_t object, vm_pindex_t pindex)); -#define PDSTACKMAX 6 +#define VATRACK 4 +#define PDSTACKMAX 16 static vm_offset_t pdstack[PDSTACKMAX]; static int pdstackptr; /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). 
*/ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD); kernel_pmap->pm_count = 1; - TAILQ_INIT(&kernel_pmap->pm_pvlist.pv_list); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufmap is used to map the system message buffer. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0; pmap_update(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t npg, s; int i; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ - s = (vm_size_t) (sizeof(pv_table_t) * npg); + s = (vm_size_t) (sizeof(struct pv_entry *) * npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); - pv_table = (pv_table_t *) addr; - for(i=0;i= clean_eva)) return 1; else return 0; } /* * The below are finer grained pmap_update routines. These eliminate * the gratuitious tlb flushes on non-i386 architectures. 
*/ -static PMAP_INLINE void +static __inline void pmap_update_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) pmap_update(); else #endif __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va)); } -static PMAP_INLINE void +static __inline void pmap_update_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { pmap_update(); } else #endif { __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va1)); __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va2)); } } static __pure unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); pmap_update(); } return (unsigned *) APTmap; } /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ -unsigned * __pure +__inline unsigned * __pure pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { return get_ptbase(pmap) + i386_btop(va); } return (0); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * pmap_update calls. */ -unsigned * __pure +__inline unsigned * __pure pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { - unsigned pde, newpf; + unsigned pde; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? 
*/ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap + i386_btop(va); } - newpf = pde & PG_FRAME; - if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { - * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; - pmap_update_1pg((vm_offset_t) PADDR1); - } + * (int *) PMAP1 = (pde & PG_FRAME) | PG_V | PG_RW; + pmap_update_1pg((vm_offset_t) PADDR1); return PADDR1 + ((unsigned) i386_btop(va) & (NPTEPG - 1)); } return (0); } + /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t __pure pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { unsigned *pte; pte = get_ptbase(pmap) + i386_btop(va); return ((*pte & PG_FRAME) | (va & PAGE_MASK)); } return 0; } /* * determine if a page is managed (memory vs. device) */ -static PMAP_INLINE __pure int +static __inline __pure int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) pmap_update_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. 
*/ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; pmap_update_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a pmap_update after doing the pmap_kenter... */ -PMAP_INLINE void +__inline void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) pmap_update_1pg(va); } /* * remove a page from the kernel pagetables */ -PMAP_INLINE void +__inline void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; pmap_update_1pg(va); } -static vm_page_t -pmap_page_alloc(object, pindex) - vm_object_t object; - vm_pindex_t pindex; + +/*************************************************** + * Page table page management routines..... + ***************************************************/ + +/* + * This routine unholds page table pages, and if the hold count + * drops to zero, then it decrements the wire count. + */ +static __inline int +pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { + vm_page_unhold(m); + if (m->hold_count == 0) { + vm_offset_t pteva; + /* + * unmap the page table page + */ + pmap->pm_pdir[m->pindex] = 0; + --pmap->pm_stats.resident_count; + /* + * Do a pmap_update to make the invalidated mapping + * take effect immediately. + */ + pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); + pmap_update_1pg(pteva); + /* + * If the page is finally unwired, simply free it. + */ + --m->wire_count; + if (m->wire_count == 0) { + vm_page_free_zero(m); + --cnt.v_wire_count; + } + return 1; + } + return 0; +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. 
+ */ +int +pmap_unuse_pt(pmap, va, mpte) + pmap_t pmap; + vm_offset_t va; + vm_page_t mpte; { - vm_page_t m; - m = vm_page_alloc(object, pindex, VM_ALLOC_ZERO); - if (m == NULL) { - VM_WAIT; + if (va >= UPT_MIN_ADDRESS) + return 0; + + if (mpte == NULL) { + vm_offset_t ptepa; + ptepa = ((vm_offset_t) *pmap_pde(pmap, va)); +#if defined(PMAP_DIAGNOSTIC) + if (!ptepa) + panic("pmap_unuse_pt: pagetable page missing, va: 0x%x", va); +#endif + if (!ptepa) + return 0; + mpte = PHYS_TO_VM_PAGE(ptepa); } - return m; + +#if defined(PMAP_DIAGNOSTIC) + if (mpte->pindex != (va >> PDRSHIFT)) + panic("pmap_unuse_pt: pindex(0x%x) != va(0x%x)", + mpte->pindex, (va >> PDRSHIFT)); + + if (mpte->hold_count == 0) { + panic("pmap_unuse_pt: hold count < 0, va: 0x%x", va); + } +#endif + + return pmap_unwire_pte_hold(pmap, mpte); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
*/ if (pdstackptr > 0) { --pdstackptr; pmap->pm_pdir = (pd_entry_t *)pdstack[pdstackptr]; } else { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); } /* * allocate object for the ptes */ pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: - ptdpg = pmap_page_alloc( pmap->pm_pteobj, PTDPTDI); - if (ptdpg == NULL) + ptdpg = vm_page_alloc( pmap->pm_pteobj, PTDPTDI, VM_ALLOC_ZERO); + if (ptdpg == NULL) { + VM_WAIT; goto retry; - - ptdpg->wire_count = 1; - ++cnt.v_wire_count; + } + vm_page_wire(ptdpg); ptdpg->flags &= ~(PG_MAPPED|PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW; pmap->pm_count = 1; - TAILQ_INIT(&pmap->pm_pvlist.pv_list); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ s = splvm(); if (p->flags & PG_BUSY) { p->flags |= PG_WANTED; tsleep(p, PVM, "pmaprl", 0); splx(s); return 0; } /* * Remove the page table page from the processes address space. 
*/ pde[p->pindex] = 0; --pmap->pm_stats.resident_count; if (p->hold_count) { + int *kvap; + int i; +#if defined(PMAP_DIAGNOSTIC) panic("pmap_release: freeing held page table page"); +#else + printf("pmap_release: freeing held page table page:\n"); +#endif + kvap = (int *)vm_pager_map_page(p); + for(i=0;ipindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } vm_page_free_zero(p); splx(s); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; int ptepindex; { - vm_offset_t ptepa; + vm_offset_t pteva, ptepa; vm_page_t m; /* * Find or fabricate a new pagetable page */ retry: m = vm_page_lookup(pmap->pm_pteobj, ptepindex); if (m == NULL) { - m = pmap_page_alloc(pmap->pm_pteobj, ptepindex); - if (m == NULL) + m = vm_page_alloc(pmap->pm_pteobj, ptepindex, VM_ALLOC_ZERO); + if (m == NULL) { + VM_WAIT; goto retry; + } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(VM_PAGE_TO_PHYS(m)); m->flags &= ~(PG_ZERO|PG_BUSY); m->valid = VM_PAGE_BITS_ALL; } else { if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "ptewai", 0); goto retry; } } + /* + * mark the object writeable + */ + pmap->pm_pteobj->flags |= OBJ_WRITEABLE; + if (m->queue != PQ_NONE) { int s = splvm(); - vm_page_unqueue(m,1); + vm_page_unqueue(m); splx(s); } - if (m->wire_count == 0) - ++cnt.v_wire_count; - ++m->wire_count; - + if (m->hold_count == 0) { + if (m->wire_count == 0) + ++cnt.v_wire_count; + ++m->wire_count; + } /* * Increment the hold count for the page table page * (denoting a new mapping.) */ ++m->hold_count; /* * Map the pagetable page into the process address space, if * it isn't already there. 
*/ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V); + pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); + pmap_update_1pg(pteva); m->flags |= PG_MAPPED; return m; } -PMAP_INLINE static vm_page_t +static __inline vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { int ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { - m = vm_page_lookup( pmap->pm_pteobj, ptepindex); + m = PHYS_TO_VM_PAGE(ptepa); ++m->hold_count; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
*/ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); ptdpg = NULL; retry: for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } if (!pmap_release_free_page(pmap, p)) goto retry; } if (ptdpg == NULL) panic("pmap_release: missing page table directory page"); if (!pmap_release_free_page(pmap, ptdpg)) goto retry; vm_object_deallocate(object); if (pdstackptr < PDSTACKMAX) { pdstack[pdstackptr] = (vm_offset_t) pmap->pm_pdir; ++pdstackptr; } else { kmem_free(kernel_map, (vm_offset_t) pmap->pm_pdir, PAGE_SIZE); } } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; if (!nkpg) { nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); pmap_zero_page(VM_PAGE_TO_PHYS(nkpg)); } pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_RW); nkpg = NULL; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); } } *pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the 
given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); free((caddr_t) pmap, M_VMPMAP); } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ -static PMAP_INLINE void +static __inline void free_pv_entry(pv) pv_entry_t pv; { ++pv_freelistcnt; - TAILQ_INSERT_HEAD(&pv_freelist, pv, pv_list); + pv->pv_next = pv_freelist; + pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ -static PMAP_INLINE pv_entry_t +static __inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ - if (pv_freelistcnt < PV_FREELIST_MIN) { + if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } - /* * get a pv_entry off of the free list */ --pv_freelistcnt; - tmp = TAILQ_FIRST(&pv_freelist); - TAILQ_REMOVE(&pv_freelist, tmp, pv_list); + tmp = pv_freelist; + pv_freelist = tmp->pv_next; return tmp; } /* * This *strange* allocation routine eliminates the possibility of a malloc * failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... * We really need to do the slab allocator thingie here. */ static void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? 
*/ if (npvvapg) { vm_page_t m; /* * allocate a physical page out of the vm system */ m = vm_page_alloc(kernel_object, OFF_TO_IDX(pvva - vm_map_min(kernel_map)), VM_ALLOC_INTERRUPT); if (m) { int newentries; int i; pv_entry_t entry; newentries = (PAGE_SIZE / sizeof(struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ pvva += PAGE_SIZE; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } } - if (TAILQ_FIRST(&pv_freelist) == NULL) + if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic if * this is too small.) */ npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + PAGE_SIZE - 1) / PAGE_SIZE; pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE); /* * get the first batch of entries */ pmap_alloc_pv_entry(); } /* - * This routine unholds page table pages, and if the hold count - * drops to zero, then it decrements the wire count. + * If it is the first entry on the list, it is actually + * in the header and we must copy the following entry up + * to the header. Otherwise we must search the list for + * the entry. In either case we free the now unused entry. */ -static int -pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { - vm_page_unhold(m); - if (m->hold_count == 0) { - vm_offset_t pteva; - /* - * unmap the page table page - */ - pmap->pm_pdir[m->pindex] = 0; - --pmap->pm_stats.resident_count; - if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == - (((unsigned) PTDpde) & PG_FRAME)) { - /* - * Do a pmap_update to make the invalidated mapping - * take effect immediately. 
- */ - pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); - pmap_update_1pg(pteva); +static __inline int +pmap_remove_entry(pmap, ppv, va) + struct pmap *pmap; + pv_entry_t *ppv; + vm_offset_t va; +{ + pv_entry_t npv; + int s; + + s = splvm(); + for (npv = *ppv; npv; (ppv = &npv->pv_next, npv = *ppv)) { + if (pmap == npv->pv_pmap && va == npv->pv_va) { + int rtval = pmap_unuse_pt(pmap, va, npv->pv_ptem); + *ppv = npv->pv_next; + free_pv_entry(npv); + splx(s); + return rtval; } - /* - * If the page is finally unwired, simply free it. - */ - --m->wire_count; - if (m->wire_count == 0) { - vm_page_free_zero(m); - --cnt.v_wire_count; - } - return 1; } + splx(s); return 0; } /* - * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. + * Create a pv entry for page at pa for + * (pmap, va). */ -PMAP_INLINE int -pmap_unuse_pt(pmap, va, mpte) +static __inline void +pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; + vm_offset_t pa; { - int ptepindex; - if (va >= UPT_MIN_ADDRESS) - return 0; - if (mpte == NULL) { - ptepindex = (va >> PDRSHIFT); - mpte = vm_page_lookup( pmap->pm_pteobj, ptepindex); - } - - return pmap_unwire_pte_hold(pmap, mpte); -} - -/* - * If it is the first entry on the list, it is actually - * in the header and we must copy the following entry up - * to the header. Otherwise we must search the list for - * the entry. In either case we free the now unused entry. 
- */ -static int -pmap_remove_entry(pmap, ppv, va) - struct pmap *pmap; - pv_table_t *ppv; - vm_offset_t va; -{ - pv_entry_t pv; - int rtval; int s; + pv_entry_t *ppv, pv; s = splvm(); - if (ppv->pv_list_count < pmap->pm_stats.resident_count) { - for (pv = TAILQ_FIRST(&ppv->pv_list); - pv; - pv = TAILQ_NEXT(pv, pv_list)) { - if (pmap == pv->pv_pmap && va == pv->pv_va) - break; - } - } else { - for (pv = TAILQ_FIRST(&pmap->pm_pvlist.pv_list); - pv; - pv = TAILQ_NEXT(pv, pv_plist)) { - if (va == pv->pv_va) - break; - } - } + pv = get_pv_entry(); + pv->pv_va = va; + pv->pv_pmap = pmap; + pv->pv_ptem = mpte; - rtval = 0; - if (pv) { - rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); - TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); - --ppv->pv_list_count; - TAILQ_REMOVE(&pmap->pm_pvlist.pv_list, pv, pv_plist); - free_pv_entry(pv); - } - + ppv = pa_to_pvh(pa); + if (*ppv) + pv->pv_next = *ppv; + else + pv->pv_next = NULL; + *ppv = pv; splx(s); - return rtval; } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; + pv_entry_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf("pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, (int) oldpte); } #endif if (pmap_track_modified(va)) PHYS_TO_VM_PAGE(oldpte)->dirty = VM_PAGE_BITS_ALL; } - return pmap_remove_entry(pmap, pa_to_pvh(oldpte), va); + ppv = pa_to_pvh(oldpte); + return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! 
*/ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); pmap_update_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; vm_page_t mpte; int anyvalid; + vm_offset_t vachanged[VATRACK]; if (pmap == NULL) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE) == eva) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); - mpte = NULL; for (; sindex < eindex; sindex = pdnxt) { /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); ptpaddr = (vm_offset_t) *pmap_pde(pmap, i386_ptob(sindex)); /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; + if (sindex < i386_btop(UPT_MIN_ADDRESS)) { /* + * get the vm_page_t for the page table page + */ + mpte = PHYS_TO_VM_PAGE(ptpaddr); + + /* + * if the pte isn't wired, just skip it. + */ + if (mpte->wire_count == 0) + continue; + } + + /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); + if (anyvalid < VATRACK) + vachanged[anyvalid] = va; anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { - pmap_update(); + if (anyvalid <= VATRACK) { + int i; + for(i=0;ipv_list); - pv; - pv = npv) { + for (pv = *ppv; pv; pv=pv->pv_next) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (tpte = *pte) { pv->pv_pmap->pm_stats.resident_count--; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ - if ((tpte & (PG_M|PG_MANAGED)) == (PG_M|PG_MANAGED)) { + if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf("pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte); } #endif - if (pmap_track_modified(pv->pv_va)) - nmodify += 1; + if (pmap_track_modified(pv->pv_va)) { + if (m == NULL) + m = PHYS_TO_VM_PAGE(pa); + m->dirty = VM_PAGE_BITS_ALL; + } } } - TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist.pv_list, pv, pv_plist); + } - npv = TAILQ_NEXT(pv, pv_list); - TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); - --ppv->pv_list_count; + for (pv = *ppv; pv; pv = npv) { + npv = pv->pv_next; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } - + *ppv = NULL; splx(s); - return nmodify; } /* * Set the physical protection on the * specified range of this map as requested. 
*/ void pmap_protect(pmap, sva, eva, prot) register pmap_t pmap; vm_offset_t sva, eva; vm_prot_t prot; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; vm_page_t mpte; int anyvalid; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anyvalid = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); - mpte = NULL; for (; sindex < eindex; sindex = pdnxt) { pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); ptpaddr = (vm_offset_t) *pmap_pde(pmap, i386_ptob(sindex)); /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; + /* + * Skip page ranges, where the page table page isn't wired. + * If the page table page is not wired, there are no page mappings + * there. + */ + if (sindex < i386_btop(UPT_MIN_ADDRESS)) { + mpte = PHYS_TO_VM_PAGE(ptpaddr); + + if (mpte->wire_count == 0) + continue; + } + if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits = ptbase[sindex]; - if ((pbits & (PG_RW|PG_V)) == (PG_RW|PG_V)) { - if ((pbits & (PG_M|PG_MANAGED)) == (PG_M|PG_MANAGED)) { + if (pbits & PG_RW) { + if (pbits & PG_M) { vm_offset_t sva = i386_ptob(sindex); if (pmap_track_modified(sva)) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->dirty = VM_PAGE_BITS_ALL; } } ptbase[sindex] = pbits & ~(PG_M|PG_RW); anyvalid = 1; } } } if (anyvalid) pmap_update(); } /* - * Create a pv entry for page at pa for - * (pmap, va). 
- */ -static void -pmap_insert_entry(pmap, va, mpte, pa) - pmap_t pmap; - vm_offset_t va; - vm_page_t mpte; - vm_offset_t pa; -{ - - int s; - pv_entry_t pv; - pv_table_t *ppv; - - s = splvm(); - pv = get_pv_entry(); - pv->pv_va = va; - pv->pv_pmap = pmap; - pv->pv_ptem = mpte; - - TAILQ_INSERT_TAIL(&pmap->pm_pvlist.pv_list, pv, pv_plist); - - ppv = pa_to_pvh(pa); - TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); - ++ppv->pv_list_count; - - splx(s); -} - -/* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) mpte = pmap_allocpte(pmap, va); - pte = pmap_pte(pmap, va); + pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); } origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ - if (origpte && (opa == pa)) { + if (opa == pa) { /* * Wiring change, just update stats. 
We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf("pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { - if ((origpte & PG_M) && pmap_track_modified(va)) { - vm_page_t m; - m = PHYS_TO_VM_PAGE(pa); - m->dirty = VM_PAGE_BITS_ALL; + vm_page_t m; + if (origpte & PG_M) { + if (pmap_track_modified(va)) { + m = PHYS_TO_VM_PAGE(pa); + m->dirty = VM_PAGE_BITS_ALL; + } } pa |= PG_MANAGED; } if (mpte) --mpte->hold_count; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ - if (origpte) { + if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte; if (origpte) pmap_update_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. 
Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ -static vm_page_t -pmap_enter_quick(pmap, va, pa, mpte) +static void +pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; - vm_page_t mpte; { register unsigned *pte; + vm_page_t mpte; + mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ - if (va < UPT_MIN_ADDRESS) { - int ptepindex; - vm_offset_t ptepa; + if (va < UPT_MIN_ADDRESS) + mpte = pmap_allocpte(pmap, va); - /* - * Calculate pagetable page index - */ - ptepindex = va >> PDRSHIFT; - if (mpte && (mpte->pindex == ptepindex)) { - ++mpte->hold_count; - } else { - /* - * Get the page directory entry - */ - ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; - - /* - * If the page table page is mapped, we just increment - * the hold count, and activate it. - */ - if (ptepa) { - mpte = vm_page_lookup( pmap->pm_pteobj, ptepindex); - ++mpte->hold_count; - } else { - mpte = _pmap_allocpte(pmap, ptepindex); - } - } - } else { - mpte = NULL; - } - /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); - return NULL; + return; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; - return mpte; + return; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. 
This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; - vm_page_t p, mpte; + vm_page_t p; int objpgs; psize = i386_btop(size); if (!pmap || (object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; - mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if (p->queue == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; - mpte = pmap_enter_quick(pmap, + pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), - VM_PAGE_TO_PHYS(p), mpte); + VM_PAGE_TO_PHYS(p)); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if (p->queue == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; - mpte = pmap_enter_quick(pmap, + pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), - VM_PAGE_TO_PHYS(p), mpte); + VM_PAGE_TO_PHYS(p)); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. 
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; - vm_page_t m, mpte; + vm_page_t m; if (entry->object.vm_object != object) return; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } - mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if (m->queue == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; - mpte = pmap_enter_quick(pmap, addr, - VM_PAGE_TO_PHYS(m), mpte); + pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); m->flags |= PG_MAPPED; PAGE_WAKEUP(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. 
* In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } + + /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) return; dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); pmap_update(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; - int ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); - pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); - ptepindex = addr >> PDRSHIFT; - - srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; - if (srcptepaddr == 0) + srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[addr >> PDRSHIFT]; + if (srcptepaddr == 0) { continue; + } - srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); + srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); if (srcmpte->hold_count == 0) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = 
(unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* - * Clear the modified and - * accessed (referenced) bits - * during the copy. + * Simply clear the modified and accessed (referenced) + * bits. */ *dst_pte = ptetemp & ~(PG_M|PG_A); dst_pmap->pm_stats.resident_count++; - pmap_insert_entry(dst_pmap, addr, - dstmpte, + pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; ++src_pte; ++dst_pte; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME); bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; pmap_update_1pg((vm_offset_t) CADDR2); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; pmap_update_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { - register pv_entry_t pv; - pv_table_t *ppv; + register pv_entry_t *ppv, pv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ - for (pv = TAILQ_FIRST(&ppv->pv_list); - pv; - pv = TAILQ_NEXT(pv, pv_list)) { + for (pv = *ppv; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } -#define PMAP_REMOVE_PAGES_CURPROC_ONLY /* - * Remove all pages from specified address space - * this aids process exit speeds. Also, this code - * is special cased for current process only. + * pmap_testbit tests bits in pte's + * note that the testbit/changebit routines are inline, + * and a lot of things compile-time evaluate. 
*/ -void -pmap_remove_pages(pmap, sva, eva) - pmap_t pmap; - vm_offset_t sva, eva; +static __inline boolean_t +pmap_testbit(pa, bit) + register vm_offset_t pa; + int bit; { - unsigned *pte, tpte; - pv_table_t *ppv; - pv_entry_t pv, npv; + register pv_entry_t *ppv, pv; + unsigned *pte; int s; -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY - if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { - printf("warning: pmap_remove_pages called with non-current pmap\n"); - return; - } -#endif + if (!pmap_is_managed(pa)) + return FALSE; + ppv = pa_to_pvh(pa); + if (*ppv == NULL) + return FALSE; + s = splvm(); + /* + * Not found, check current mappings returning immediately if found. + */ + for (pv = *ppv ;pv; pv = pv->pv_next) { - for(pv = TAILQ_FIRST(&pmap->pm_pvlist.pv_list); - pv; - pv = npv) { + /* + * if the bit being tested is the modified bit, then + * mark clean_map and ptes as never + * modified. + */ + if (bit & (PG_A|PG_M)) { + if (!pmap_track_modified(pv->pv_va)) + continue; + } - if (pv->pv_va >= eva || pv->pv_va < sva) { - npv = TAILQ_NEXT(pv, pv_plist); + if (!pv->pv_pmap) { +#if defined(PMAP_DIAGNOSTIC) + printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); +#endif continue; } - -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY - pte = (unsigned *)vtopte(pv->pv_va); -#else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); -#endif - tpte = *pte; - *pte = 0; - - if (tpte) { - pv->pv_pmap->pm_stats.resident_count--; - if (tpte & PG_W) - pv->pv_pmap->pm_stats.wired_count--; - /* - * Update the vm_page_t clean and reference bits. 
- */ - if (tpte & PG_M) { - PHYS_TO_VM_PAGE(tpte)->dirty = VM_PAGE_BITS_ALL; - } + if (pte == NULL) + continue; + if (*pte & bit) { + splx(s); + return TRUE; } + } + splx(s); + return (FALSE); +} - npv = TAILQ_NEXT(pv, pv_plist); - TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist.pv_list, pv, pv_plist); +/* + * this routine is used to modify bits in ptes + */ +static __inline void +pmap_changebit(pa, bit, setem) + vm_offset_t pa; + int bit; + boolean_t setem; +{ + register pv_entry_t pv, *ppv; + register unsigned *pte; + vm_offset_t va; + int changed; + int s; - ppv = pa_to_pvh(tpte); - --ppv->pv_list_count; - TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); + if (!pmap_is_managed(pa)) + return; - pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); - free_pv_entry(pv); + s = splvm(); + changed = 0; + ppv = pa_to_pvh(pa); + /* + * Loop over all current mappings setting/clearing as appropos If + * setting RO do we need to clear the VAC? + */ + for ( pv = *ppv; pv; pv = pv->pv_next) { + va = pv->pv_va; + + /* + * don't write protect pager mappings + */ + if (!setem && (bit == PG_RW)) { + if (va >= clean_sva && va < clean_eva) + continue; + } + if (!pv->pv_pmap) { +#if defined(PMAP_DIAGNOSTIC) + printf("Null pmap (cb) at va: 0x%lx\n", va); +#endif + continue; + } + + pte = pmap_pte_quick(pv->pv_pmap, va); + if (pte == NULL) + continue; + if (setem) { + *(int *)pte |= bit; + changed = 1; + } else { + vm_offset_t pbits = *(vm_offset_t *)pte; + if (pbits & bit) + changed = 1; + if (bit == PG_RW) { + if (pbits & PG_M) { + vm_page_t m; + vm_offset_t pa = pbits & PG_FRAME; + m = PHYS_TO_VM_PAGE(pa); + m->dirty = VM_PAGE_BITS_ALL; + } + *(int *)pte = pbits & ~(PG_M|PG_RW); + } else { + *(int *)pte = pbits & ~bit; + } + } } - pmap_update(); splx(s); + if (changed) + pmap_update(); } +/* + * pmap_page_protect: + * + * Lower the permission for all mappings to a given page. 
+ */ +void +pmap_page_protect(phys, prot) + vm_offset_t phys; + vm_prot_t prot; +{ + if ((prot & VM_PROT_WRITE) == 0) { + if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { + pmap_changebit(phys, PG_RW, FALSE); + } else { + pmap_remove_all(phys); + pmap_update(); + } + } +} + vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* - * pmap_tcbit: + * pmap_is_referenced: * - * Return the count of bits for a page, clearing all of them. + * Return whether or not the specified physical page was referenced + * by any physical maps. + */ +boolean_t +pmap_is_referenced(vm_offset_t pa) +{ + register pv_entry_t *ppv, pv, lpv; + unsigned *pte; + int s; + + if (!pmap_is_managed(pa)) + return FALSE; + + ppv = pa_to_pvh(pa); + + s = splvm(); + /* + * Not found, check current mappings returning immediately if found. + */ + for (lpv = NULL, pv = *ppv ;pv; lpv = pv, pv = pv->pv_next) { + /* + * if the bit being tested is the modified bit, then + * mark clean_map and ptes as never + * modified. + */ + if (!pmap_track_modified(pv->pv_va)) + continue; + if (!pv->pv_pmap) { + continue; + } + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + if (pte == NULL) + continue; + if ((int) *pte & PG_A) { + if (lpv) { + lpv->pv_next = pv->pv_next; + pv->pv_next = *ppv; + *ppv = pv; + } + splx(s); + return TRUE; + } + } + splx(s); + return (FALSE); +} + +/* + * pmap_ts_referenced: + * + * Return the count of reference bits for a page, clearing all of them. * */ int -pmap_tcbit(vm_offset_t pa, int bit) +pmap_ts_referenced(vm_offset_t pa) { - register pv_entry_t pv, npv; - pv_table_t *ppv; + register pv_entry_t *ppv, pv; unsigned *pte; int s; int rtval = 0; + vm_offset_t vachanged[VATRACK]; + if (!pmap_is_managed(pa)) + return FALSE; + s = splvm(); ppv = pa_to_pvh(pa); + + if (*ppv == NULL) { + splx(s); + return 0; + } + /* * Not found, check current mappings returning immediately if found. 
*/ - for (pv = TAILQ_FIRST(&ppv->pv_list); - pv; - pv = npv) { - npv = TAILQ_NEXT(pv, pv_list); + for (pv = *ppv ;pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ - if (((bit & PG_M) != 0) - && !pmap_track_modified(pv->pv_va)) + if (!pmap_track_modified(pv->pv_va)) continue; + if (!pv->pv_pmap) { + continue; + } pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) continue; - - if ((rtval == 0) && (*pte & bit)) { - rtval = 1; + if (*pte & PG_A) { + if (rtval < VATRACK) + vachanged[rtval] = pv->pv_va; + rtval++; + *pte &= ~PG_A; } - *pte &= ~bit; } splx(s); - if (rtval) - pmap_update(); + if (rtval) { + if (rtval <= VATRACK) { + int i; + for(i=0;idirty = VM_PAGE_BITS_ALL; - - return rtval; + return pmap_testbit((pa), PG_M); } /* - * pmap_tc_referenced: - * - * Return the count of referenced bits for a page, clearing all of them. - * + * Clear the modify bits on the specified physical page. */ -int -pmap_tc_referenced(vm_offset_t pa) +void +pmap_clear_modify(vm_offset_t pa) { - if (!pmap_is_managed(pa)) - return 0; - return pmap_tcbit(pa, PG_A); + pmap_changebit((pa), PG_M, FALSE); } /* - * pmap_page_protect: + * pmap_clear_reference: * - * Lower the permission for all mappings to a given page. + * Clear the reference bit on the specified physical page. 
*/ void -pmap_page_protect(m, prot) - vm_page_t m; - vm_prot_t prot; +pmap_clear_reference(vm_offset_t pa) { - if ((prot & VM_PROT_WRITE) == 0) { - if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { - if ((m->flags & PG_FICTITIOUS) == 0) - pmap_tcbit(VM_PAGE_TO_PHYS(m), PG_RW); - } else { - if (pmap_remove_all(VM_PAGE_TO_PHYS(m))) { - m->dirty = VM_PAGE_BITS_ALL; - } - pmap_update(); - } - } + pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. The non-cacheable bits are set on each * mapped page. 
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | PG_N; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; int val = 0; - ptep = pmap_pte_quick(pmap, addr); + ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { vm_offset_t pa; val = MINCORE_INCORE; pa = pte & PG_FRAME; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (PHYS_TO_VM_PAGE(pa)->dirty || - pmap_tcbit(pa, PG_M)) { + pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; - PHYS_TO_VM_PAGE(pa)->dirty = VM_PAGE_BITS_ALL; - } /* * Referenced by us */ if (pte & PG_U) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((PHYS_TO_VM_PAGE(pa)->flags & PG_REFERENCED) || - pmap_tcbit(pa, PG_A)) + pmap_is_referenced(pa)) val |= MINCORE_REFERENCED_OTHER; } return val; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = &p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } - pte = pmap_pte_quick( pmap, va); + pte = pmap_pte( pmap, va); if (pte && 
pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; - ptep = pmap_pte_quick(pm, va); + ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); - for (pv = TAILQ_FIRST(pa_to_pvh(pa)); - pv; - pv = TAILQ_NEXT(pv, pv_list)) { + for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/amd64/include/pmap.h =================================================================== --- head/sys/amd64/include/pmap.h (revision 17333) +++ head/sys/amd64/include/pmap.h (revision 17334) @@ -1,237 +1,224 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. 
* * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 - * $Id: pmap.h,v 1.40 1996/06/08 11:21:19 bde Exp $ + * $Id: pmap.h,v 1.41 1996/07/27 03:23:32 dyson Exp $ */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ - /* * Page-directory and page-table entires follow this format, with a few * of the fields not present here and there, depending on a lot of things. */ /* ---- Intel Nomenclature ---- */ #define PG_V 0x001 /* P Valid */ #define PG_RW 0x002 /* R/W Read/Write */ #define PG_U 0x004 /* U/S User/Supervisor */ #define PG_NC_PWT 0x008 /* PWT Write through */ #define PG_NC_PCD 0x010 /* PCD Cache disable */ #define PG_A 0x020 /* A Accessed */ #define PG_M 0x040 /* D Dirty */ #define PG_PS 0x080 /* PS Page size (0=4k,1=4M) */ #define PG_G 0x100 /* G Global */ #define PG_AVAIL1 0x200 /* / Available for system */ #define PG_AVAIL2 0x400 /* < programmers use */ #define PG_AVAIL3 0x800 /* \ */ /* Our various interpretations of the above */ #define PG_W PG_AVAIL1 /* "Wired" pseudoflag */ #define PG_MANAGED PG_AVAIL2 #define PG_FRAME (~PAGE_MASK) #define PG_PROT (PG_RW|PG_U) /* all protection bits . */ #define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ /* * Page Protection Exception bits */ #define PGEX_P 0x01 /* Protection violation vs. not present */ #define PGEX_W 0x02 /* during a Write cycle */ #define PGEX_U 0x04 /* access from User mode (UPL) */ /* * Pte related macros */ #define VADDR(pdi, pti) ((vm_offset_t)(((pdi)< - typedef unsigned int *pd_entry_t; typedef unsigned int *pt_entry_t; #define PDESIZE sizeof(pd_entry_t) /* for assembly files */ #define PTESIZE sizeof(pt_entry_t) /* for assembly files */ /* * Address of current and alternate address space page table maps * and directories. 
*/ #ifdef KERNEL extern pt_entry_t PTmap[], APTmap[], Upte; extern pd_entry_t PTD[], APTD[], PTDpde, APTDpde, Upde; extern int IdlePTD; /* physical address of "Idle" state directory */ #endif /* * virtual address to page table entry and * to physical address. Likewise for alternate address space. * Note: these work recursively, thus vtopte of a pte will give * the corresponding pde that in turn maps it. */ #define vtopte(va) (PTmap + i386_btop(va)) #define vtophys(va) (((int) (*vtopte(va))&PG_FRAME) | ((int)(va) & PAGE_MASK)) #define avtopte(va) (APTmap + i386_btop(va)) #define avtophys(va) (((int) (*avtopte(va))&PG_FRAME) | ((int)(va) & PAGE_MASK)) #ifdef KERNEL /* * Routine: pmap_kextract * Function: * Extract the physical page address associated * kernel virtual address. */ static __inline vm_offset_t pmap_kextract(vm_offset_t va) { vm_offset_t pa = *(int *)vtopte(va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); return pa; } #endif -struct vm_page; - /* * Pmap stuff */ -struct pv_entry; -typedef struct { - int pv_list_count; - TAILQ_HEAD(,pv_entry) pv_list; -} pv_table_t; struct pmap { pd_entry_t *pm_pdir; /* KVA of page directory */ vm_object_t pm_pteobj; /* Container for pte's */ - pv_table_t pm_pvlist; /* list of mappings in pmap */ - int pm_count; /* reference count */ + short pm_dref; /* page directory ref count */ + short pm_count; /* pmap reference count */ struct pmap_statistics pm_stats; /* pmap statistics */ - struct vm_page *pm_ptphint; /* pmap ptp hint */ + struct vm_map *pm_map; /* map that owns this pmap */ }; typedef struct pmap *pmap_t; #ifdef KERNEL extern pmap_t kernel_pmap; #endif - /* * For each vm_page_t, there is a list of all currently valid virtual * mappings of that page. An entry is a pv_entry_t, the list is pv_table. 
*/ typedef struct pv_entry { + struct pv_entry *pv_next; /* next pv_entry */ pmap_t pv_pmap; /* pmap where mapping lies */ vm_offset_t pv_va; /* virtual address for mapping */ - TAILQ_ENTRY(pv_entry) pv_list; - TAILQ_ENTRY(pv_entry) pv_plist; vm_page_t pv_ptem; /* VM page for pte */ } *pv_entry_t; #define PV_ENTRY_NULL ((pv_entry_t) 0) #define PV_CI 0x01 /* all entries must be cache inhibited */ #define PV_PTPAGE 0x02 /* entry maps a page table page */ #ifdef KERNEL extern caddr_t CADDR1; extern pt_entry_t *CMAP1; extern vm_offset_t avail_end; extern vm_offset_t avail_start; extern vm_offset_t phys_avail[]; -pv_table_t *pv_table; +extern pv_entry_t *pv_table; /* array of entries, one per page */ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; #define pa_index(pa) atop(pa - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define pmap_resident_count(pmap) ((pmap)->pm_stats.resident_count) struct pcb; void pmap_bootstrap __P(( vm_offset_t, vm_offset_t)); pmap_t pmap_kernel __P((void)); void *pmap_mapdev __P((vm_offset_t, vm_size_t)); unsigned * __pure pmap_pte __P((pmap_t, vm_offset_t)) __pure2; int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); vm_page_t pmap_use_pt __P((pmap_t, vm_offset_t)); #endif /* KERNEL */ #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 17333) +++ head/sys/i386/i386/pmap.c (revision 17334) @@ -1,2549 +1,2669 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.111 1996/07/28 20:31:27 dyson Exp $ + * $Id: pmap.c,v 1.113 1996/07/29 14:22:46 dyson Exp $ */ /* * Manages physical address maps. 
* * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PMAP_KEEP_PDIRS #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif -#if !defined(SMALL_KERNEL) -#define PMAP_INLINE __inline -#else -#define PMAP_INLINE -#endif - static void init_pv_entries __P((int)); /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine 
independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ static vm_offset_t vm_first_phys; static int nkpt; static vm_page_t nkpg; vm_offset_t kernel_vm_end; extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; #define PV_FREELIST_MIN ((PAGE_SIZE / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ static int pv_freelistcnt; -TAILQ_HEAD (,pv_entry) pv_freelist; +static pv_entry_t pv_freelist; static vm_offset_t pvva; static int npvvapg; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1; static pt_entry_t *CMAP2, *ptmmap; +static pv_entry_t *pv_table; caddr_t CADDR1, ptvmmap; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp; pt_entry_t *PMAP1; unsigned *PADDR1; static void free_pv_entry __P((pv_entry_t pv)); -static unsigned * get_ptbase __P((pmap_t pmap)); +static __inline unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_alloc_pv_entry __P((void)); +static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static int pmap_is_managed __P((vm_offset_t pa)); -static int pmap_remove_all __P((vm_offset_t pa)); -static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, - vm_offset_t pa, vm_page_t mpte)); +static void pmap_remove_all __P((vm_offset_t pa)); +static void pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, + vm_offset_t pa)); static int pmap_remove_pte 
__P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); -static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, +static __inline int pmap_remove_entry __P((struct pmap *pmap, pv_entry_t *pv, vm_offset_t va)); -static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, +static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); +static __inline void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); -static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); +static __inline vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); -static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); +static __inline int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, int ptepindex)); -unsigned * __pure pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); -int pmap_tcbit __P((vm_offset_t pa, int bit)); -static vm_page_t pmap_page_alloc __P((vm_object_t object, vm_pindex_t pindex)); -#define PDSTACKMAX 6 +#define VATRACK 4 +#define PDSTACKMAX 16 static vm_offset_t pdstack[PDSTACKMAX]; static int pdstackptr; /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. 
It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD); kernel_pmap->pm_count = 1; - TAILQ_INIT(&kernel_pmap->pm_pvlist.pv_list); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufmap is used to map the system message buffer. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0; pmap_update(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. 
*/ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t npg, s; int i; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ - s = (vm_size_t) (sizeof(pv_table_t) * npg); + s = (vm_size_t) (sizeof(struct pv_entry *) * npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); - pv_table = (pv_table_t *) addr; - for(i=0;i= clean_eva)) return 1; else return 0; } /* * The below are finer grained pmap_update routines. These eliminate * the gratuitious tlb flushes on non-i386 architectures. */ -static PMAP_INLINE void +static __inline void pmap_update_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) pmap_update(); else #endif __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va)); } -static PMAP_INLINE void +static __inline void pmap_update_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { pmap_update(); } else #endif { __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va1)); __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va2)); } } static __pure unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); pmap_update(); } return (unsigned *) APTmap; } /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. 
*/ -unsigned * __pure +__inline unsigned * __pure pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { return get_ptbase(pmap) + i386_btop(va); } return (0); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * pmap_update calls. */ -unsigned * __pure +__inline unsigned * __pure pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { - unsigned pde, newpf; + unsigned pde; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap + i386_btop(va); } - newpf = pde & PG_FRAME; - if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { - * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; - pmap_update_1pg((vm_offset_t) PADDR1); - } + * (int *) PMAP1 = (pde & PG_FRAME) | PG_V | PG_RW; + pmap_update_1pg((vm_offset_t) PADDR1); return PADDR1 + ((unsigned) i386_btop(va) & (NPTEPG - 1)); } return (0); } + /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t __pure pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { unsigned *pte; pte = get_ptbase(pmap) + i386_btop(va); return ((*pte & PG_FRAME) | (va & PAGE_MASK)); } return 0; } /* * determine if a page is managed (memory vs. device) */ -static PMAP_INLINE __pure int +static __inline __pure int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... 
***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) pmap_update_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; pmap_update_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a pmap_update after doing the pmap_kenter... */ -PMAP_INLINE void +__inline void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) pmap_update_1pg(va); } /* * remove a page from the kernel pagetables */ -PMAP_INLINE void +__inline void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; pmap_update_1pg(va); } -static vm_page_t -pmap_page_alloc(object, pindex) - vm_object_t object; - vm_pindex_t pindex; + +/*************************************************** + * Page table page management routines..... + ***************************************************/ + +/* + * This routine unholds page table pages, and if the hold count + * drops to zero, then it decrements the wire count. 
+ */ +static __inline int +pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { + vm_page_unhold(m); + if (m->hold_count == 0) { + vm_offset_t pteva; + /* + * unmap the page table page + */ + pmap->pm_pdir[m->pindex] = 0; + --pmap->pm_stats.resident_count; + /* + * Do a pmap_update to make the invalidated mapping + * take effect immediately. + */ + pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); + pmap_update_1pg(pteva); + /* + * If the page is finally unwired, simply free it. + */ + --m->wire_count; + if (m->wire_count == 0) { + vm_page_free_zero(m); + --cnt.v_wire_count; + } + return 1; + } + return 0; +} + +/* + * After removing a page table entry, this routine is used to + * conditionally free the page, and manage the hold/wire counts. + */ +int +pmap_unuse_pt(pmap, va, mpte) + pmap_t pmap; + vm_offset_t va; + vm_page_t mpte; { - vm_page_t m; - m = vm_page_alloc(object, pindex, VM_ALLOC_ZERO); - if (m == NULL) { - VM_WAIT; + if (va >= UPT_MIN_ADDRESS) + return 0; + + if (mpte == NULL) { + vm_offset_t ptepa; + ptepa = ((vm_offset_t) *pmap_pde(pmap, va)); +#if defined(PMAP_DIAGNOSTIC) + if (!ptepa) + panic("pmap_unuse_pt: pagetable page missing, va: 0x%x", va); +#endif + if (!ptepa) + return 0; + mpte = PHYS_TO_VM_PAGE(ptepa); } - return m; + +#if defined(PMAP_DIAGNOSTIC) + if (mpte->pindex != (va >> PDRSHIFT)) + panic("pmap_unuse_pt: pindex(0x%x) != va(0x%x)", + mpte->pindex, (va >> PDRSHIFT)); + + if (mpte->hold_count == 0) { + panic("pmap_unuse_pt: hold count < 0, va: 0x%x", va); + } +#endif + + return pmap_unwire_pte_hold(pmap, mpte); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
*/ if (pdstackptr > 0) { --pdstackptr; pmap->pm_pdir = (pd_entry_t *)pdstack[pdstackptr]; } else { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); } /* * allocate object for the ptes */ pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: - ptdpg = pmap_page_alloc( pmap->pm_pteobj, PTDPTDI); - if (ptdpg == NULL) + ptdpg = vm_page_alloc( pmap->pm_pteobj, PTDPTDI, VM_ALLOC_ZERO); + if (ptdpg == NULL) { + VM_WAIT; goto retry; - - ptdpg->wire_count = 1; - ++cnt.v_wire_count; + } + vm_page_wire(ptdpg); ptdpg->flags &= ~(PG_MAPPED|PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW; pmap->pm_count = 1; - TAILQ_INIT(&pmap->pm_pvlist.pv_list); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ s = splvm(); if (p->flags & PG_BUSY) { p->flags |= PG_WANTED; tsleep(p, PVM, "pmaprl", 0); splx(s); return 0; } /* * Remove the page table page from the processes address space. 
*/ pde[p->pindex] = 0; --pmap->pm_stats.resident_count; if (p->hold_count) { + int *kvap; + int i; +#if defined(PMAP_DIAGNOSTIC) panic("pmap_release: freeing held page table page"); +#else + printf("pmap_release: freeing held page table page:\n"); +#endif + kvap = (int *)vm_pager_map_page(p); + for(i=0;ipindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } vm_page_free_zero(p); splx(s); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; int ptepindex; { - vm_offset_t ptepa; + vm_offset_t pteva, ptepa; vm_page_t m; /* * Find or fabricate a new pagetable page */ retry: m = vm_page_lookup(pmap->pm_pteobj, ptepindex); if (m == NULL) { - m = pmap_page_alloc(pmap->pm_pteobj, ptepindex); - if (m == NULL) + m = vm_page_alloc(pmap->pm_pteobj, ptepindex, VM_ALLOC_ZERO); + if (m == NULL) { + VM_WAIT; goto retry; + } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(VM_PAGE_TO_PHYS(m)); m->flags &= ~(PG_ZERO|PG_BUSY); m->valid = VM_PAGE_BITS_ALL; } else { if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED; tsleep(m, PVM, "ptewai", 0); goto retry; } } + /* + * mark the object writeable + */ + pmap->pm_pteobj->flags |= OBJ_WRITEABLE; + if (m->queue != PQ_NONE) { int s = splvm(); - vm_page_unqueue(m,1); + vm_page_unqueue(m); splx(s); } - if (m->wire_count == 0) - ++cnt.v_wire_count; - ++m->wire_count; - + if (m->hold_count == 0) { + if (m->wire_count == 0) + ++cnt.v_wire_count; + ++m->wire_count; + } /* * Increment the hold count for the page table page * (denoting a new mapping.) */ ++m->hold_count; /* * Map the pagetable page into the process address space, if * it isn't already there. 
*/ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V); + pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); + pmap_update_1pg(pteva); m->flags |= PG_MAPPED; return m; } -PMAP_INLINE static vm_page_t +static __inline vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { int ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { - m = vm_page_lookup( pmap->pm_pteobj, ptepindex); + m = PHYS_TO_VM_PAGE(ptepa); ++m->hold_count; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
*/ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); ptdpg = NULL; retry: for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } if (!pmap_release_free_page(pmap, p)) goto retry; } if (ptdpg == NULL) panic("pmap_release: missing page table directory page"); if (!pmap_release_free_page(pmap, ptdpg)) goto retry; vm_object_deallocate(object); if (pdstackptr < PDSTACKMAX) { pdstack[pdstackptr] = (vm_offset_t) pmap->pm_pdir; ++pdstackptr; } else { kmem_free(kernel_map, (vm_offset_t) pmap->pm_pdir, PAGE_SIZE); } } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; if (!nkpg) { nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); pmap_zero_page(VM_PAGE_TO_PHYS(nkpg)); } pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_RW); nkpg = NULL; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); } } *pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the 
given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); free((caddr_t) pmap, M_VMPMAP); } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ -static PMAP_INLINE void +static __inline void free_pv_entry(pv) pv_entry_t pv; { ++pv_freelistcnt; - TAILQ_INSERT_HEAD(&pv_freelist, pv, pv_list); + pv->pv_next = pv_freelist; + pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ -static PMAP_INLINE pv_entry_t +static __inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ - if (pv_freelistcnt < PV_FREELIST_MIN) { + if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } - /* * get a pv_entry off of the free list */ --pv_freelistcnt; - tmp = TAILQ_FIRST(&pv_freelist); - TAILQ_REMOVE(&pv_freelist, tmp, pv_list); + tmp = pv_freelist; + pv_freelist = tmp->pv_next; return tmp; } /* * This *strange* allocation routine eliminates the possibility of a malloc * failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... * We really need to do the slab allocator thingie here. */ static void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? 
*/ if (npvvapg) { vm_page_t m; /* * allocate a physical page out of the vm system */ m = vm_page_alloc(kernel_object, OFF_TO_IDX(pvva - vm_map_min(kernel_map)), VM_ALLOC_INTERRUPT); if (m) { int newentries; int i; pv_entry_t entry; newentries = (PAGE_SIZE / sizeof(struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ pvva += PAGE_SIZE; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } } - if (TAILQ_FIRST(&pv_freelist) == NULL) + if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic if * this is too small.) */ npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + PAGE_SIZE - 1) / PAGE_SIZE; pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE); /* * get the first batch of entries */ pmap_alloc_pv_entry(); } /* - * This routine unholds page table pages, and if the hold count - * drops to zero, then it decrements the wire count. + * If it is the first entry on the list, it is actually + * in the header and we must copy the following entry up + * to the header. Otherwise we must search the list for + * the entry. In either case we free the now unused entry. */ -static int -pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { - vm_page_unhold(m); - if (m->hold_count == 0) { - vm_offset_t pteva; - /* - * unmap the page table page - */ - pmap->pm_pdir[m->pindex] = 0; - --pmap->pm_stats.resident_count; - if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == - (((unsigned) PTDpde) & PG_FRAME)) { - /* - * Do a pmap_update to make the invalidated mapping - * take effect immediately. 
- */ - pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); - pmap_update_1pg(pteva); +static __inline int +pmap_remove_entry(pmap, ppv, va) + struct pmap *pmap; + pv_entry_t *ppv; + vm_offset_t va; +{ + pv_entry_t npv; + int s; + + s = splvm(); + for (npv = *ppv; npv; (ppv = &npv->pv_next, npv = *ppv)) { + if (pmap == npv->pv_pmap && va == npv->pv_va) { + int rtval = pmap_unuse_pt(pmap, va, npv->pv_ptem); + *ppv = npv->pv_next; + free_pv_entry(npv); + splx(s); + return rtval; } - /* - * If the page is finally unwired, simply free it. - */ - --m->wire_count; - if (m->wire_count == 0) { - vm_page_free_zero(m); - --cnt.v_wire_count; - } - return 1; } + splx(s); return 0; } /* - * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. + * Create a pv entry for page at pa for + * (pmap, va). */ -PMAP_INLINE int -pmap_unuse_pt(pmap, va, mpte) +static __inline void +pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; + vm_offset_t pa; { - int ptepindex; - if (va >= UPT_MIN_ADDRESS) - return 0; - if (mpte == NULL) { - ptepindex = (va >> PDRSHIFT); - mpte = vm_page_lookup( pmap->pm_pteobj, ptepindex); - } - - return pmap_unwire_pte_hold(pmap, mpte); -} - -/* - * If it is the first entry on the list, it is actually - * in the header and we must copy the following entry up - * to the header. Otherwise we must search the list for - * the entry. In either case we free the now unused entry. 
- */ -static int -pmap_remove_entry(pmap, ppv, va) - struct pmap *pmap; - pv_table_t *ppv; - vm_offset_t va; -{ - pv_entry_t pv; - int rtval; int s; + pv_entry_t *ppv, pv; s = splvm(); - if (ppv->pv_list_count < pmap->pm_stats.resident_count) { - for (pv = TAILQ_FIRST(&ppv->pv_list); - pv; - pv = TAILQ_NEXT(pv, pv_list)) { - if (pmap == pv->pv_pmap && va == pv->pv_va) - break; - } - } else { - for (pv = TAILQ_FIRST(&pmap->pm_pvlist.pv_list); - pv; - pv = TAILQ_NEXT(pv, pv_plist)) { - if (va == pv->pv_va) - break; - } - } + pv = get_pv_entry(); + pv->pv_va = va; + pv->pv_pmap = pmap; + pv->pv_ptem = mpte; - rtval = 0; - if (pv) { - rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); - TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); - --ppv->pv_list_count; - TAILQ_REMOVE(&pmap->pm_pvlist.pv_list, pv, pv_plist); - free_pv_entry(pv); - } - + ppv = pa_to_pvh(pa); + if (*ppv) + pv->pv_next = *ppv; + else + pv->pv_next = NULL; + *ppv = pv; splx(s); - return rtval; } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; + pv_entry_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf("pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, (int) oldpte); } #endif if (pmap_track_modified(va)) PHYS_TO_VM_PAGE(oldpte)->dirty = VM_PAGE_BITS_ALL; } - return pmap_remove_entry(pmap, pa_to_pvh(oldpte), va); + ppv = pa_to_pvh(oldpte); + return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! 
*/ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); pmap_update_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; vm_page_t mpte; int anyvalid; + vm_offset_t vachanged[VATRACK]; if (pmap == NULL) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE) == eva) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); - mpte = NULL; for (; sindex < eindex; sindex = pdnxt) { /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); ptpaddr = (vm_offset_t) *pmap_pde(pmap, i386_ptob(sindex)); /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; + if (sindex < i386_btop(UPT_MIN_ADDRESS)) { /* + * get the vm_page_t for the page table page + */ + mpte = PHYS_TO_VM_PAGE(ptpaddr); + + /* + * if the pte isn't wired, just skip it. + */ + if (mpte->wire_count == 0) + continue; + } + + /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); + if (anyvalid < VATRACK) + vachanged[anyvalid] = va; anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { - pmap_update(); + if (anyvalid <= VATRACK) { + int i; + for(i=0;ipv_list); - pv; - pv = npv) { + for (pv = *ppv; pv; pv=pv->pv_next) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (tpte = *pte) { pv->pv_pmap->pm_stats.resident_count--; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ - if ((tpte & (PG_M|PG_MANAGED)) == (PG_M|PG_MANAGED)) { + if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf("pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte); } #endif - if (pmap_track_modified(pv->pv_va)) - nmodify += 1; + if (pmap_track_modified(pv->pv_va)) { + if (m == NULL) + m = PHYS_TO_VM_PAGE(pa); + m->dirty = VM_PAGE_BITS_ALL; + } } } - TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist.pv_list, pv, pv_plist); + } - npv = TAILQ_NEXT(pv, pv_list); - TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); - --ppv->pv_list_count; + for (pv = *ppv; pv; pv = npv) { + npv = pv->pv_next; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } - + *ppv = NULL; splx(s); - return nmodify; } /* * Set the physical protection on the * specified range of this map as requested. 
*/ void pmap_protect(pmap, sva, eva, prot) register pmap_t pmap; vm_offset_t sva, eva; vm_prot_t prot; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; vm_page_t mpte; int anyvalid; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anyvalid = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); - mpte = NULL; for (; sindex < eindex; sindex = pdnxt) { pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); ptpaddr = (vm_offset_t) *pmap_pde(pmap, i386_ptob(sindex)); /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; + /* + * Skip page ranges, where the page table page isn't wired. + * If the page table page is not wired, there are no page mappings + * there. + */ + if (sindex < i386_btop(UPT_MIN_ADDRESS)) { + mpte = PHYS_TO_VM_PAGE(ptpaddr); + + if (mpte->wire_count == 0) + continue; + } + if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits = ptbase[sindex]; - if ((pbits & (PG_RW|PG_V)) == (PG_RW|PG_V)) { - if ((pbits & (PG_M|PG_MANAGED)) == (PG_M|PG_MANAGED)) { + if (pbits & PG_RW) { + if (pbits & PG_M) { vm_offset_t sva = i386_ptob(sindex); if (pmap_track_modified(sva)) { vm_page_t m = PHYS_TO_VM_PAGE(pbits); m->dirty = VM_PAGE_BITS_ALL; } } ptbase[sindex] = pbits & ~(PG_M|PG_RW); anyvalid = 1; } } } if (anyvalid) pmap_update(); } /* - * Create a pv entry for page at pa for - * (pmap, va). 
- */ -static void -pmap_insert_entry(pmap, va, mpte, pa) - pmap_t pmap; - vm_offset_t va; - vm_page_t mpte; - vm_offset_t pa; -{ - - int s; - pv_entry_t pv; - pv_table_t *ppv; - - s = splvm(); - pv = get_pv_entry(); - pv->pv_va = va; - pv->pv_pmap = pmap; - pv->pv_ptem = mpte; - - TAILQ_INSERT_TAIL(&pmap->pm_pvlist.pv_list, pv, pv_plist); - - ppv = pa_to_pvh(pa); - TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); - ++ppv->pv_list_count; - - splx(s); -} - -/* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) mpte = pmap_allocpte(pmap, va); - pte = pmap_pte(pmap, va); + pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); } origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ - if (origpte && (opa == pa)) { + if (opa == pa) { /* * Wiring change, just update stats. 
We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf("pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { - if ((origpte & PG_M) && pmap_track_modified(va)) { - vm_page_t m; - m = PHYS_TO_VM_PAGE(pa); - m->dirty = VM_PAGE_BITS_ALL; + vm_page_t m; + if (origpte & PG_M) { + if (pmap_track_modified(va)) { + m = PHYS_TO_VM_PAGE(pa); + m->dirty = VM_PAGE_BITS_ALL; + } } pa |= PG_MANAGED; } if (mpte) --mpte->hold_count; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ - if (origpte) { + if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte; if (origpte) pmap_update_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. 
Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ -static vm_page_t -pmap_enter_quick(pmap, va, pa, mpte) +static void +pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; - vm_page_t mpte; { register unsigned *pte; + vm_page_t mpte; + mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ - if (va < UPT_MIN_ADDRESS) { - int ptepindex; - vm_offset_t ptepa; + if (va < UPT_MIN_ADDRESS) + mpte = pmap_allocpte(pmap, va); - /* - * Calculate pagetable page index - */ - ptepindex = va >> PDRSHIFT; - if (mpte && (mpte->pindex == ptepindex)) { - ++mpte->hold_count; - } else { - /* - * Get the page directory entry - */ - ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; - - /* - * If the page table page is mapped, we just increment - * the hold count, and activate it. - */ - if (ptepa) { - mpte = vm_page_lookup( pmap->pm_pteobj, ptepindex); - ++mpte->hold_count; - } else { - mpte = _pmap_allocpte(pmap, ptepindex); - } - } - } else { - mpte = NULL; - } - /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); - return NULL; + return; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; - return mpte; + return; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. 
This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; - vm_page_t p, mpte; + vm_page_t p; int objpgs; psize = i386_btop(size); if (!pmap || (object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; - mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if (p->queue == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; - mpte = pmap_enter_quick(pmap, + pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), - VM_PAGE_TO_PHYS(p), mpte); + VM_PAGE_TO_PHYS(p)); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if (p->queue == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; - mpte = pmap_enter_quick(pmap, + pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), - VM_PAGE_TO_PHYS(p), mpte); + VM_PAGE_TO_PHYS(p)); p->flags |= PG_MAPPED; PAGE_WAKEUP(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. 
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; - vm_page_t m, mpte; + vm_page_t m; if (entry->object.vm_object != object) return; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } - mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if (m->queue == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; - mpte = pmap_enter_quick(pmap, addr, - VM_PAGE_TO_PHYS(m), mpte); + pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); m->flags |= PG_MAPPED; PAGE_WAKEUP(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. 
* In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } + + /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) return; dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); pmap_update(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; - int ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); - pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); - ptepindex = addr >> PDRSHIFT; - - srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; - if (srcptepaddr == 0) + srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[addr >> PDRSHIFT]; + if (srcptepaddr == 0) { continue; + } - srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); + srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); if (srcmpte->hold_count == 0) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = 
(unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* - * Clear the modified and - * accessed (referenced) bits - * during the copy. + * Simply clear the modified and accessed (referenced) + * bits. */ *dst_pte = ptetemp & ~(PG_M|PG_A); dst_pmap->pm_stats.resident_count++; - pmap_insert_entry(dst_pmap, addr, - dstmpte, + pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; ++src_pte; ++dst_pte; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME); bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; pmap_update_1pg((vm_offset_t) CADDR2); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_RW | (src & PG_FRAME); *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME); bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; pmap_update_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { - register pv_entry_t pv; - pv_table_t *ppv; + register pv_entry_t *ppv, pv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ - for (pv = TAILQ_FIRST(&ppv->pv_list); - pv; - pv = TAILQ_NEXT(pv, pv_list)) { + for (pv = *ppv; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } -#define PMAP_REMOVE_PAGES_CURPROC_ONLY /* - * Remove all pages from specified address space - * this aids process exit speeds. Also, this code - * is special cased for current process only. + * pmap_testbit tests bits in pte's + * note that the testbit/changebit routines are inline, + * and a lot of things compile-time evaluate. 
*/ -void -pmap_remove_pages(pmap, sva, eva) - pmap_t pmap; - vm_offset_t sva, eva; +static __inline boolean_t +pmap_testbit(pa, bit) + register vm_offset_t pa; + int bit; { - unsigned *pte, tpte; - pv_table_t *ppv; - pv_entry_t pv, npv; + register pv_entry_t *ppv, pv; + unsigned *pte; int s; -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY - if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { - printf("warning: pmap_remove_pages called with non-current pmap\n"); - return; - } -#endif + if (!pmap_is_managed(pa)) + return FALSE; + ppv = pa_to_pvh(pa); + if (*ppv == NULL) + return FALSE; + s = splvm(); + /* + * Not found, check current mappings returning immediately if found. + */ + for (pv = *ppv ;pv; pv = pv->pv_next) { - for(pv = TAILQ_FIRST(&pmap->pm_pvlist.pv_list); - pv; - pv = npv) { + /* + * if the bit being tested is the modified bit, then + * mark clean_map and ptes as never + * modified. + */ + if (bit & (PG_A|PG_M)) { + if (!pmap_track_modified(pv->pv_va)) + continue; + } - if (pv->pv_va >= eva || pv->pv_va < sva) { - npv = TAILQ_NEXT(pv, pv_plist); + if (!pv->pv_pmap) { +#if defined(PMAP_DIAGNOSTIC) + printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); +#endif continue; } - -#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY - pte = (unsigned *)vtopte(pv->pv_va); -#else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); -#endif - tpte = *pte; - *pte = 0; - - if (tpte) { - pv->pv_pmap->pm_stats.resident_count--; - if (tpte & PG_W) - pv->pv_pmap->pm_stats.wired_count--; - /* - * Update the vm_page_t clean and reference bits. 
- */ - if (tpte & PG_M) { - PHYS_TO_VM_PAGE(tpte)->dirty = VM_PAGE_BITS_ALL; - } + if (pte == NULL) + continue; + if (*pte & bit) { + splx(s); + return TRUE; } + } + splx(s); + return (FALSE); +} - npv = TAILQ_NEXT(pv, pv_plist); - TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist.pv_list, pv, pv_plist); +/* + * this routine is used to modify bits in ptes + */ +static __inline void +pmap_changebit(pa, bit, setem) + vm_offset_t pa; + int bit; + boolean_t setem; +{ + register pv_entry_t pv, *ppv; + register unsigned *pte; + vm_offset_t va; + int changed; + int s; - ppv = pa_to_pvh(tpte); - --ppv->pv_list_count; - TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); + if (!pmap_is_managed(pa)) + return; - pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); - free_pv_entry(pv); + s = splvm(); + changed = 0; + ppv = pa_to_pvh(pa); + /* + * Loop over all current mappings setting/clearing as appropos If + * setting RO do we need to clear the VAC? + */ + for ( pv = *ppv; pv; pv = pv->pv_next) { + va = pv->pv_va; + + /* + * don't write protect pager mappings + */ + if (!setem && (bit == PG_RW)) { + if (va >= clean_sva && va < clean_eva) + continue; + } + if (!pv->pv_pmap) { +#if defined(PMAP_DIAGNOSTIC) + printf("Null pmap (cb) at va: 0x%lx\n", va); +#endif + continue; + } + + pte = pmap_pte_quick(pv->pv_pmap, va); + if (pte == NULL) + continue; + if (setem) { + *(int *)pte |= bit; + changed = 1; + } else { + vm_offset_t pbits = *(vm_offset_t *)pte; + if (pbits & bit) + changed = 1; + if (bit == PG_RW) { + if (pbits & PG_M) { + vm_page_t m; + vm_offset_t pa = pbits & PG_FRAME; + m = PHYS_TO_VM_PAGE(pa); + m->dirty = VM_PAGE_BITS_ALL; + } + *(int *)pte = pbits & ~(PG_M|PG_RW); + } else { + *(int *)pte = pbits & ~bit; + } + } } - pmap_update(); splx(s); + if (changed) + pmap_update(); } +/* + * pmap_page_protect: + * + * Lower the permission for all mappings to a given page. 
+ */ +void +pmap_page_protect(phys, prot) + vm_offset_t phys; + vm_prot_t prot; +{ + if ((prot & VM_PROT_WRITE) == 0) { + if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { + pmap_changebit(phys, PG_RW, FALSE); + } else { + pmap_remove_all(phys); + pmap_update(); + } + } +} + vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* - * pmap_tcbit: + * pmap_is_referenced: * - * Return the count of bits for a page, clearing all of them. + * Return whether or not the specified physical page was referenced + * by any physical maps. + */ +boolean_t +pmap_is_referenced(vm_offset_t pa) +{ + register pv_entry_t *ppv, pv, lpv; + unsigned *pte; + int s; + + if (!pmap_is_managed(pa)) + return FALSE; + + ppv = pa_to_pvh(pa); + + s = splvm(); + /* + * Not found, check current mappings returning immediately if found. + */ + for (lpv = NULL, pv = *ppv ;pv; lpv = pv, pv = pv->pv_next) { + /* + * if the bit being tested is the modified bit, then + * mark clean_map and ptes as never + * modified. + */ + if (!pmap_track_modified(pv->pv_va)) + continue; + if (!pv->pv_pmap) { + continue; + } + pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); + if (pte == NULL) + continue; + if ((int) *pte & PG_A) { + if (lpv) { + lpv->pv_next = pv->pv_next; + pv->pv_next = *ppv; + *ppv = pv; + } + splx(s); + return TRUE; + } + } + splx(s); + return (FALSE); +} + +/* + * pmap_ts_referenced: + * + * Return the count of reference bits for a page, clearing all of them. * */ int -pmap_tcbit(vm_offset_t pa, int bit) +pmap_ts_referenced(vm_offset_t pa) { - register pv_entry_t pv, npv; - pv_table_t *ppv; + register pv_entry_t *ppv, pv; unsigned *pte; int s; int rtval = 0; + vm_offset_t vachanged[VATRACK]; + if (!pmap_is_managed(pa)) + return FALSE; + s = splvm(); ppv = pa_to_pvh(pa); + + if (*ppv == NULL) { + splx(s); + return 0; + } + /* * Not found, check current mappings returning immediately if found. 
*/ - for (pv = TAILQ_FIRST(&ppv->pv_list); - pv; - pv = npv) { - npv = TAILQ_NEXT(pv, pv_list); + for (pv = *ppv ;pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ - if (((bit & PG_M) != 0) - && !pmap_track_modified(pv->pv_va)) + if (!pmap_track_modified(pv->pv_va)) continue; + if (!pv->pv_pmap) { + continue; + } pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) continue; - - if ((rtval == 0) && (*pte & bit)) { - rtval = 1; + if (*pte & PG_A) { + if (rtval < VATRACK) + vachanged[rtval] = pv->pv_va; + rtval++; + *pte &= ~PG_A; } - *pte &= ~bit; } splx(s); - if (rtval) - pmap_update(); + if (rtval) { + if (rtval <= VATRACK) { + int i; + for(i=0;idirty = VM_PAGE_BITS_ALL; - - return rtval; + return pmap_testbit((pa), PG_M); } /* - * pmap_tc_referenced: - * - * Return the count of referenced bits for a page, clearing all of them. - * + * Clear the modify bits on the specified physical page. */ -int -pmap_tc_referenced(vm_offset_t pa) +void +pmap_clear_modify(vm_offset_t pa) { - if (!pmap_is_managed(pa)) - return 0; - return pmap_tcbit(pa, PG_A); + pmap_changebit((pa), PG_M, FALSE); } /* - * pmap_page_protect: + * pmap_clear_reference: * - * Lower the permission for all mappings to a given page. + * Clear the reference bit on the specified physical page. 
*/ void -pmap_page_protect(m, prot) - vm_page_t m; - vm_prot_t prot; +pmap_clear_reference(vm_offset_t pa) { - if ((prot & VM_PROT_WRITE) == 0) { - if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { - if ((m->flags & PG_FICTITIOUS) == 0) - pmap_tcbit(VM_PAGE_TO_PHYS(m), PG_RW); - } else { - if (pmap_remove_all(VM_PAGE_TO_PHYS(m))) { - m->dirty = VM_PAGE_BITS_ALL; - } - pmap_update(); - } - } + pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. The non-cacheable bits are set on each * mapped page. 
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | PG_N; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; int val = 0; - ptep = pmap_pte_quick(pmap, addr); + ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { vm_offset_t pa; val = MINCORE_INCORE; pa = pte & PG_FRAME; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (PHYS_TO_VM_PAGE(pa)->dirty || - pmap_tcbit(pa, PG_M)) { + pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; - PHYS_TO_VM_PAGE(pa)->dirty = VM_PAGE_BITS_ALL; - } /* * Referenced by us */ if (pte & PG_U) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((PHYS_TO_VM_PAGE(pa)->flags & PG_REFERENCED) || - pmap_tcbit(pa, PG_A)) + pmap_is_referenced(pa)) val |= MINCORE_REFERENCED_OTHER; } return val; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = &p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } - pte = pmap_pte_quick( pmap, va); + pte = pmap_pte( pmap, va); if (pte && 
pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; - ptep = pmap_pte_quick(pm, va); + ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); - for (pv = TAILQ_FIRST(pa_to_pvh(pa)); - pv; - pv = TAILQ_NEXT(pv, pv_list)) { + for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/include/pmap.h =================================================================== --- head/sys/i386/include/pmap.h (revision 17333) +++ head/sys/i386/include/pmap.h (revision 17334) @@ -1,237 +1,224 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. 
* * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 - * $Id: pmap.h,v 1.40 1996/06/08 11:21:19 bde Exp $ + * $Id: pmap.h,v 1.41 1996/07/27 03:23:32 dyson Exp $ */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ - /* * Page-directory and page-table entires follow this format, with a few * of the fields not present here and there, depending on a lot of things. */ /* ---- Intel Nomenclature ---- */ #define PG_V 0x001 /* P Valid */ #define PG_RW 0x002 /* R/W Read/Write */ #define PG_U 0x004 /* U/S User/Supervisor */ #define PG_NC_PWT 0x008 /* PWT Write through */ #define PG_NC_PCD 0x010 /* PCD Cache disable */ #define PG_A 0x020 /* A Accessed */ #define PG_M 0x040 /* D Dirty */ #define PG_PS 0x080 /* PS Page size (0=4k,1=4M) */ #define PG_G 0x100 /* G Global */ #define PG_AVAIL1 0x200 /* / Available for system */ #define PG_AVAIL2 0x400 /* < programmers use */ #define PG_AVAIL3 0x800 /* \ */ /* Our various interpretations of the above */ #define PG_W PG_AVAIL1 /* "Wired" pseudoflag */ #define PG_MANAGED PG_AVAIL2 #define PG_FRAME (~PAGE_MASK) #define PG_PROT (PG_RW|PG_U) /* all protection bits . */ #define PG_N (PG_NC_PWT|PG_NC_PCD) /* Non-cacheable */ /* * Page Protection Exception bits */ #define PGEX_P 0x01 /* Protection violation vs. not present */ #define PGEX_W 0x02 /* during a Write cycle */ #define PGEX_U 0x04 /* access from User mode (UPL) */ /* * Pte related macros */ #define VADDR(pdi, pti) ((vm_offset_t)(((pdi)< - typedef unsigned int *pd_entry_t; typedef unsigned int *pt_entry_t; #define PDESIZE sizeof(pd_entry_t) /* for assembly files */ #define PTESIZE sizeof(pt_entry_t) /* for assembly files */ /* * Address of current and alternate address space page table maps * and directories. 
*/ #ifdef KERNEL extern pt_entry_t PTmap[], APTmap[], Upte; extern pd_entry_t PTD[], APTD[], PTDpde, APTDpde, Upde; extern int IdlePTD; /* physical address of "Idle" state directory */ #endif /* * virtual address to page table entry and * to physical address. Likewise for alternate address space. * Note: these work recursively, thus vtopte of a pte will give * the corresponding pde that in turn maps it. */ #define vtopte(va) (PTmap + i386_btop(va)) #define vtophys(va) (((int) (*vtopte(va))&PG_FRAME) | ((int)(va) & PAGE_MASK)) #define avtopte(va) (APTmap + i386_btop(va)) #define avtophys(va) (((int) (*avtopte(va))&PG_FRAME) | ((int)(va) & PAGE_MASK)) #ifdef KERNEL /* * Routine: pmap_kextract * Function: * Extract the physical page address associated * kernel virtual address. */ static __inline vm_offset_t pmap_kextract(vm_offset_t va) { vm_offset_t pa = *(int *)vtopte(va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); return pa; } #endif -struct vm_page; - /* * Pmap stuff */ -struct pv_entry; -typedef struct { - int pv_list_count; - TAILQ_HEAD(,pv_entry) pv_list; -} pv_table_t; struct pmap { pd_entry_t *pm_pdir; /* KVA of page directory */ vm_object_t pm_pteobj; /* Container for pte's */ - pv_table_t pm_pvlist; /* list of mappings in pmap */ - int pm_count; /* reference count */ + short pm_dref; /* page directory ref count */ + short pm_count; /* pmap reference count */ struct pmap_statistics pm_stats; /* pmap statistics */ - struct vm_page *pm_ptphint; /* pmap ptp hint */ + struct vm_map *pm_map; /* map that owns this pmap */ }; typedef struct pmap *pmap_t; #ifdef KERNEL extern pmap_t kernel_pmap; #endif - /* * For each vm_page_t, there is a list of all currently valid virtual * mappings of that page. An entry is a pv_entry_t, the list is pv_table. 
*/ typedef struct pv_entry { + struct pv_entry *pv_next; /* next pv_entry */ pmap_t pv_pmap; /* pmap where mapping lies */ vm_offset_t pv_va; /* virtual address for mapping */ - TAILQ_ENTRY(pv_entry) pv_list; - TAILQ_ENTRY(pv_entry) pv_plist; vm_page_t pv_ptem; /* VM page for pte */ } *pv_entry_t; #define PV_ENTRY_NULL ((pv_entry_t) 0) #define PV_CI 0x01 /* all entries must be cache inhibited */ #define PV_PTPAGE 0x02 /* entry maps a page table page */ #ifdef KERNEL extern caddr_t CADDR1; extern pt_entry_t *CMAP1; extern vm_offset_t avail_end; extern vm_offset_t avail_start; extern vm_offset_t phys_avail[]; -pv_table_t *pv_table; +extern pv_entry_t *pv_table; /* array of entries, one per page */ extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; #define pa_index(pa) atop(pa - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define pmap_resident_count(pmap) ((pmap)->pm_stats.resident_count) struct pcb; void pmap_bootstrap __P(( vm_offset_t, vm_offset_t)); pmap_t pmap_kernel __P((void)); void *pmap_mapdev __P((vm_offset_t, vm_size_t)); unsigned * __pure pmap_pte __P((pmap_t, vm_offset_t)) __pure2; int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); vm_page_t pmap_use_pt __P((pmap_t, vm_offset_t)); #endif /* KERNEL */ #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 17333) +++ head/sys/kern/kern_exec.c (revision 17334) @@ -1,616 +1,616 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: kern_exec.c,v 1.44 1996/07/12 04:11:37 bde Exp $ + * $Id: kern_exec.c,v 1.45 1996/07/27 03:23:41 dyson Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int *exec_copyout_strings __P((struct image_params *)); static int exec_check_permissions(struct image_params *); /* * XXX trouble here if sizeof(caddr_t) != sizeof(int), other parts * of the sysctl code also assumes this, and sizeof(int) == sizeof(long). */ static struct ps_strings *ps_strings = PS_STRINGS; SYSCTL_INT(_kern, KERN_PS_STRINGS, ps_strings, 0, &ps_strings, 0, ""); static caddr_t usrstack = (caddr_t)USRSTACK; SYSCTL_INT(_kern, KERN_USRSTACK, usrstack, 0, &usrstack, 0, ""); /* * execsw_set is constructed for us by the linker. 
Each of the items * is a pointer to a `const struct execsw', hence the double pointer here. */ static const struct execsw **execsw = (const struct execsw **)&execsw_set.ls_items[0]; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * execve() system call. */ int execve(p, uap, retval) struct proc *p; register struct execve_args *uap; int *retval; { struct nameidata nd, *ndp; int *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; imgp = &image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->uap = uap; imgp->attr = &attr; imgp->image_header = NULL; imgp->argc = imgp->envc = 0; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; imgp->auxargs = NULL; /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX); if (imgp->stringbase == NULL) { error = ENOMEM; goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, uap->fname, p); interpret: error = namei(ndp); if (error) { kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); goto exec_fail; } imgp->vp = ndp->ni_vp; if (imgp->vp == NULL) { error = ENOEXEC; goto exec_fail_dealloc; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); /* * Lose the lock on the vnode. It's no longer needed, and must not * exist for the pagefault paging to work below. 
*/ VOP_UNLOCK(imgp->vp); if (error) goto exec_fail_dealloc; /* * Map the image header (first page) of the file into * kernel address space */ error = vm_mmap(exech_map, /* map */ (vm_offset_t *)&imgp->image_header, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ (caddr_t)imgp->vp, /* vnode */ 0); /* offset */ if (error) { uprintf("mmap failed: %d\n",error); goto exec_fail_dealloc; } /* * Loop through list of image activators, calling each one. * If there is no match, the activator returns -1. If there * is a match, but there was an error during the activation, * the error is returned. Otherwise 0 means success. If the * image is interpreted, loop back up and try activating * the interpreter. */ for (i = 0; execsw[i]; ++i) { if (execsw[i]->ex_imgact) error = (*execsw[i]->ex_imgact)(imgp); else continue; if (error == -1) continue; if (error) goto exec_fail_dealloc; if (imgp->interpreted) { /* free old vnode and name buffer */ vrele(ndp->ni_vp); FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, (vm_offset_t)imgp->image_header + PAGE_SIZE)) panic("execve: header dealloc failed (1)"); /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, p); goto interpret; } break; } /* If we made it through all the activators and none matched, exit. */ if (error == -1) { error = ENOEXEC; goto exec_fail_dealloc; } /* * Copy out strings (args and env) and initialize stack base */ stack_base = exec_copyout_strings(imgp); p->p_vmspace->vm_minsaddr = (char *)stack_base; /* * If custom stack fixup routine present for this process * let it do the stack setup. 
* Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->argc); /* close files on exec */ fdcloseexec(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has it's own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup((caddr_t)p->p_pptr); } /* * Implement image setuid/setgid. Disallow if the process is * being traced. */ if ((attr.va_mode & (VSUID | VSGID)) && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. */ if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) { p->p_traceflag = 0; vrele(p->p_tracep); p->p_tracep = NULL; } /* * Set the new credentials. */ p->p_ucred = crcopy(p->p_ucred); if (attr.va_mode & VSUID) p->p_ucred->cr_uid = attr.va_uid; if (attr.va_mode & VSGID) p->p_ucred->cr_groups[0] = attr.va_gid; p->p_flag |= P_SUGID; } else { p->p_flag &= ~P_SUGID; } /* * Implement correct POSIX saved-id behavior. */ p->p_cred->p_svuid = p->p_ucred->cr_uid; p->p_cred->p_svgid = p->p_ucred->cr_gid; /* * Store the vp for use in procfs */ if (p->p_textvp) /* release old reference */ vrele(p->p_textvp); VREF(ndp->ni_vp); p->p_textvp = ndp->ni_vp; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. 
*/ if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Set entry address */ setregs(p, imgp->entry_addr, (u_long)stack_base); /* * free various allocated resources */ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, (vm_offset_t)imgp->image_header + PAGE_SIZE)) panic("execve: header dealloc failed (2)"); vrele(ndp->ni_vp); FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); return (0); exec_fail_dealloc: if (imgp->stringbase != NULL) kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); if (imgp->image_header && imgp->image_header != (char *)-1) if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, (vm_offset_t)imgp->image_header + PAGE_SIZE)) panic("execve: header dealloc failed (3)"); if (ndp->ni_vp) vrele(ndp->ni_vp); FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); exec_fail: if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(p, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ return(0); } else { return(error); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. 
*/ int exec_new_vmspace(imgp) struct image_params *imgp; { int error; struct vmspace *vmspace = imgp->proc->p_vmspace; caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); imgp->vmspace_destroyed = 1; /* Blow away entire process VM */ if (vmspace->vm_shm) shmexit(imgp->proc); - vm_map_remove_userspace(&vmspace->vm_map); + vm_map_remove(&vmspace->vm_map, 0, USRSTACK); /* Allocate a new stack */ error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *)&stack_addr, SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return(error); vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; /* Initialize maximum stack address */ vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; return(0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_extract_strings(imgp) struct image_params *imgp; { char **argv, **envv; char *argp, *envp; int error, length; /* * extract arguments first */ argv = imgp->uap->argv; if (argv) { while ((argp = (caddr_t) fuword(argv++))) { if (argp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(argp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->argc++; } } /* * extract environment strings */ envv = imgp->uap->envv; if (envv) { while ((envp = (caddr_t) fuword(envv++))) { if (envp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(envp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->envc++; } } return (0); } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. 
*/ int * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; int *stack_base; struct ps_strings *arginfo; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ arginfo = PS_STRINGS; szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(imgp->proc->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) /* * The '+ 2' is for the null pointers at the end of each of the * arg and env vector sets, and 'AT_COUNT*2' is room for the * ELF Auxargs data. */ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + AT_COUNT*2) * sizeof(char*)); else /* * The '+ 2' is for the null pointers at the end of each of the * arg and env vector sets */ vectp = (char **) (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*)); /* * vectp also becomes our initial stack base */ stack_base = (int *)vectp; stringp = imgp->stringbase; argc = imgp->argc; envc = imgp->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (int)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (int)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer seperates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (int)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. 
*/ for (; envc > 0; --envc) { suword(vectp++, (int)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Return 0 for success or error code on failure. */ static int exec_check_permissions(imgp) struct image_params *imgp; { struct proc *p = imgp->proc; struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; int error; /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) { return (ETXTBSY); } /* Get file attributes */ error = VOP_GETATTR(vp, attr, p->p_ucred, p); if (error) return (error); /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) { return (EACCES); } /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Disable setuid/setgid if the filesystem prohibits it or if * the process is being traced. */ if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED)) attr->va_mode &= ~(VSUID | VSGID); /* * Check for execute permission to file based on current credentials. * Then call filesystem specific open routine (which does nothing * in the general case). 
*/ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); if (error) return (error); error = VOP_OPEN(vp, FREAD, p->p_ucred, p); if (error) return (error); return (0); } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 17333) +++ head/sys/kern/kern_exit.c (revision 17334) @@ -1,474 +1,475 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 - * $Id: kern_exit.c,v 1.33 1996/06/12 05:07:28 gpalmer Exp $ + * $Id: kern_exit.c,v 1.34 1996/07/27 03:23:42 dyson Exp $ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #ifdef COMPAT_43 #include #include #endif #include #include #include #include #include #include #include static int wait1 __P((struct proc *, struct wait_args *, int [], int)); /* * exit -- * Death of process. */ __dead void exit(p, uap, retval) struct proc *p; struct rexit_args /* { int rval; } */ *uap; int *retval; { exit1(p, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. 
*/ __dead void exit1(p, rv) register struct proc *p; int rv; { register struct proc *q, *nq; register struct vmspace *vm; if (p->p_pid == 1) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } #ifdef PGINPROF vmsizmon(); #endif if (p->p_flag & P_PROFIL) stopprofclock(p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ p->p_flag &= ~(P_TRACED | P_PPWAIT); p->p_flag |= P_WEXIT; p->p_sigignore = ~0; p->p_siglist = 0; untimeout(realitexpire, (caddr_t)p); /* * Close open files and release open-file table. * This may block! */ fdfree(p); /* * XXX Shutdown SYSV semaphores */ semexit(p); /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; if (vm->vm_shm) shmexit(p); /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. */ if (vm->vm_refcnt == 1) - vm_map_remove_userspace(&vm->vm_map); + (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); if (SESS_LEADER(p)) { register struct session *sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. */ if (sp->s_ttyp->t_session == sp) { if (sp->s_ttyp->t_pgrp) pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); (void) ttywait(sp->s_ttyp); /* * The tty could have been revoked * if we blocked. */ if (sp->s_ttyvp) vgoneall(sp->s_ttyvp); } if (sp->s_ttyvp) vrele(sp->s_ttyvp); sp->s_ttyvp = NULL; /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. 
* (for logging and informational purposes) */ } sp->s_leader = NULL; } fixjobc(p, p->p_pgrp, 0); p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; (void)acct_process(p); #ifdef KTRACE /* * release trace file */ p->p_traceflag = 0; /* don't trace the vrele() */ if (p->p_tracep) vrele(p->p_tracep); #endif /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); p->p_stat = SZOMB; LIST_REMOVE(p, p_hash); q = p->p_children.lh_first; if (q) /* only need this if any child is S_ZOMB */ wakeup((caddr_t) initproc); for (; q != 0; q = nq) { nq = q->p_sibling.le_next; LIST_REMOVE(q, p_sibling); LIST_INSERT_HEAD(&initproc->p_children, q, p_sibling); q->p_pptr = initproc; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; psignal(q, SIGKILL); } } /* * Save exit status and final rusage info, adding in child rusage * info and self times. */ p->p_xstat = rv; *p->p_ru = p->p_stats->p_ru; calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); ruadd(p->p_ru, &p->p_stats->p_cru); /* * Notify parent that we're gone. */ psignal(p->p_pptr, SIGCHLD); wakeup((caddr_t)p->p_pptr); #if defined(tahoe) /* move this to cpu_exit */ p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL; #endif /* * Clear curproc after we've done all operations * that could block, and before tearing down the rest * of the process state that might be used from clock, etc. * Also, can't clear curproc while we're still runnable, * as we're not on a run queue (we are current, just not * a proper proc any longer!). * * Other substructures are freed from wait(). */ curproc = NULL; if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); p->p_limit = NULL; } /* * Finally, call machine-dependent code to release the remaining * resources including address space, the kernel stack and pcb. 
* The address space is released by "vmspace_free(p->p_vmspace)"; * This is machine-dependent, as we may have to change stacks * or ensure that the current one isn't reallocated before we * finish. cpu_exit will end with a call to cpu_switch(), finishing * our execution (pun intended). */ cpu_exit(p); } #ifdef COMPAT_43 #if defined(hp300) || defined(luna68k) #include #define GETPS(rp) ((struct frame *)(rp))->f_sr #else #define GETPS(rp) (rp)[PS] #endif int owait(p, uap, retval) struct proc *p; register struct owait_args /* { int dummy; } */ *uap; int *retval; { struct wait_args w; #ifdef PSL_ALLCC if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) { w.options = 0; w.rusage = NULL; } else { w.options = p->p_md.md_regs[R0]; w.rusage = (struct rusage *)p->p_md.md_regs[R1]; } #else w.options = 0; w.rusage = NULL; #endif w.pid = WAIT_ANY; w.status = NULL; return (wait1(p, &w, retval, 1)); } #endif /* COMPAT_43 */ int wait4(p, uap, retval) struct proc *p; struct wait_args *uap; int *retval; { return (wait1(p, uap, retval, 0)); } static int wait1(q, uap, retval, compat) register struct proc *q; register struct wait_args /* { int pid; int *status; int options; struct rusage *rusage; } */ *uap; int retval[]; int compat; { register int nfound; register struct proc *p, *t; int status, error; if (uap->pid == 0) uap->pid = -q->p_pgid; #ifdef notyet if (uap->options &~ (WUNTRACED|WNOHANG)) return (EINVAL); #endif loop: nfound = 0; for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) { if (uap->pid != WAIT_ANY && p->p_pid != uap->pid && p->p_pgid != -uap->pid) continue; nfound++; if (p->p_stat == SZOMB) { /* charge childs scheduling cpu usage to parent */ if (curproc->p_pid != 1) { curproc->p_estcpu = min(curproc->p_estcpu + p->p_estcpu, UCHAR_MAX); } retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) retval[1] = p->p_xstat; else #endif if (uap->status) { status = p->p_xstat; /* convert to int */ if ((error = copyout((caddr_t)&status, (caddr_t)uap->status, 
sizeof(status)))) return (error); } if (uap->rusage && (error = copyout((caddr_t)p->p_ru, (caddr_t)uap->rusage, sizeof (struct rusage)))) return (error); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent. */ if (p->p_oppid && (t = pfind(p->p_oppid))) { p->p_oppid = 0; proc_reparent(p, t); psignal(t, SIGCHLD); wakeup((caddr_t)t); return (0); } p->p_xstat = 0; ruadd(&q->p_stats->p_cru, p->p_ru); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_cred->p_ruid, -1); /* * Release reference to text vnode */ if (p->p_textvp) vrele(p->p_textvp); /* * Free up credentials. */ if (--p->p_cred->p_refcnt == 0) { crfree(p->p_cred->pc_ucred); FREE(p->p_cred, M_SUBPROC); p->p_cred = NULL; } /* * Finally finished with old proc entry. * Unlink it from its process group and free it. */ leavepgrp(p); LIST_REMOVE(p, p_list); /* off zombproc */ LIST_REMOVE(p, p_sibling); /* * Give machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. */ cpu_wait(p); FREE(p, M_PROC); nprocs--; return (0); } if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { p->p_flag |= P_WAITED; retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) { retval[1] = W_STOPCODE(p->p_xstat); error = 0; } else #endif if (uap->status) { status = W_STOPCODE(p->p_xstat); error = copyout((caddr_t)&status, (caddr_t)uap->status, sizeof(status)); } else error = 0; return (error); } } if (nfound == 0) return (ECHILD); if (uap->options & WNOHANG) { retval[0] = 0; return (0); } if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))) return (error); goto loop; } /* * make process 'parent' the new parent of process 'child'. 
 */
void
proc_reparent(child, parent)
	register struct proc *child;
	register struct proc *parent;
{
	/* Already the parent: nothing to do. */
	if (child->p_pptr == parent)
		return;

	/* Move the child from its current sibling list to the new parent's. */
	LIST_REMOVE(child, p_sibling);
	LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
	child->p_pptr = parent;
}
Index: head/sys/vm/pmap.h
===================================================================
--- head/sys/vm/pmap.h	(revision 17333)
+++ head/sys/vm/pmap.h	(revision 17334)
@@ -1,131 +1,131 @@
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.h 8.1 (Berkeley) 6/11/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Author: Avadis Tevanian, Jr. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: pmap.h,v 1.11 1996/06/17 03:35:34 dyson Exp $ + * $Id: pmap.h,v 1.13 1996/07/27 04:22:12 dyson Exp $ */ /* * Machine address mapping definitions -- machine-independent * section. [For machine-dependent section, see "machine/pmap.h".] */ #ifndef _PMAP_VM_ #define _PMAP_VM_ /* * Each machine dependent implementation is expected to * keep certain statistics. 
They may do this anyway they * so choose, but are expected to return the statistics * in the following structure. */ struct pmap_statistics { long resident_count; /* # of pages mapped (total) */ long wired_count; /* # of pages wired */ }; typedef struct pmap_statistics *pmap_statistics_t; #include #ifdef KERNEL void pmap_change_wiring __P((pmap_t, vm_offset_t, boolean_t)); void pmap_clear_modify __P((vm_offset_t pa)); void pmap_clear_reference __P((vm_offset_t pa)); void pmap_copy __P((pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t)); void pmap_copy_page __P((vm_offset_t, vm_offset_t)); void pmap_destroy __P((pmap_t)); void pmap_enter __P((pmap_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t)); vm_offset_t pmap_extract __P((pmap_t, vm_offset_t)); void pmap_growkernel __P((vm_offset_t)); void pmap_init __P((vm_offset_t, vm_offset_t)); -int pmap_tc_modified __P((vm_page_t m)); -int pmap_tc_referenced __P((vm_offset_t pa)); +boolean_t pmap_is_modified __P((vm_offset_t pa)); +boolean_t pmap_is_referenced __P((vm_offset_t pa)); +boolean_t pmap_ts_referenced __P((vm_offset_t pa)); void pmap_kenter __P((vm_offset_t, vm_offset_t)); void pmap_kremove __P((vm_offset_t)); vm_offset_t pmap_map __P((vm_offset_t, vm_offset_t, vm_offset_t, int)); void pmap_object_init_pt __P((pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_offset_t size, int pagelimit)); boolean_t pmap_page_exists __P((pmap_t, vm_offset_t)); -void pmap_page_protect __P((vm_page_t, vm_prot_t)); +void pmap_page_protect __P((vm_offset_t, vm_prot_t)); void pmap_pageable __P((pmap_t, vm_offset_t, vm_offset_t, boolean_t)); vm_offset_t pmap_phys_address __P((int)); void pmap_pinit __P((pmap_t)); void pmap_protect __P((pmap_t, vm_offset_t, vm_offset_t, vm_prot_t)); void pmap_qenter __P((vm_offset_t, vm_page_t *, int)); void pmap_qremove __P((vm_offset_t, int)); void pmap_reference __P((pmap_t)); void pmap_release __P((pmap_t)); void pmap_remove __P((pmap_t, vm_offset_t, vm_offset_t)); 
-void pmap_remove_pages __P((pmap_t, vm_offset_t, vm_offset_t)); void pmap_zero_page __P((vm_offset_t)); void pmap_prefault __P((pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, vm_object_t object)); int pmap_mincore __P((pmap_t pmap, vm_offset_t addr)); #endif /* KERNEL */ #endif /* _PMAP_VM_ */ Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 17333) +++ head/sys/vm/swap_pager.c (revision 17334) @@ -1,1670 +1,1663 @@ /* * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 - * $Id: swap_pager.c,v 1.68 1996/06/10 04:58:48 dyson Exp $ + * $Id: swap_pager.c,v 1.69 1996/07/27 03:23:51 dyson Exp $ */ /* * Quick hack to page to dedicated partition(s). 
* TODO: * Add multiprocessor locks * Deal with async writes in a better fashion */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NPENDINGIO #define NPENDINGIO 10 #endif static int nswiodone; int swap_pager_full; extern int vm_swap_size; static int no_swap_space = 1; struct rlisthdr swaplist; #define MAX_PAGEOUT_CLUSTER 16 TAILQ_HEAD(swpclean, swpagerclean); typedef struct swpagerclean *swp_clean_t; static struct swpagerclean { TAILQ_ENTRY(swpagerclean) spc_list; int spc_flags; struct buf *spc_bp; vm_object_t spc_object; vm_offset_t spc_kva; int spc_count; vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; } swcleanlist[NPENDINGIO]; /* spc_flags values */ #define SPC_ERROR 0x01 #define SWB_EMPTY (-1) /* list of completed page cleans */ static struct swpclean swap_pager_done; /* list of pending page cleans */ static struct swpclean swap_pager_inuse; /* list of free pager clean structs */ static struct swpclean swap_pager_free; int swap_pager_free_count; /* list of "named" anon region objects */ static struct pagerlst swap_pager_object_list; /* list of "unnamed" anon region objects */ struct pagerlst swap_pager_un_object_list; #define SWAP_FREE_NEEDED 0x1 /* need a swap block */ #define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2 static int swap_pager_needflags; static struct pagerlst *swp_qs[] = { &swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0 }; /* * pagerops for OBJT_SWAP - "swap pager". 
*/ static vm_object_t swap_pager_alloc __P((void *handle, vm_size_t size, vm_prot_t prot, vm_ooffset_t offset)); static void swap_pager_dealloc __P((vm_object_t object)); static boolean_t swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, int *before, int *after)); static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static void swap_pager_init __P((void)); static void swap_pager_sync __P((void)); struct pagerops swappagerops = { swap_pager_init, swap_pager_alloc, swap_pager_dealloc, swap_pager_getpages, swap_pager_putpages, swap_pager_haspage, swap_pager_sync }; static int npendingio = NPENDINGIO; static int dmmin; int dmmax; static __pure int swap_pager_block_index __P((vm_pindex_t pindex)) __pure2; static __pure int swap_pager_block_offset __P((vm_pindex_t pindex)) __pure2; static daddr_t *swap_pager_diskaddr __P((vm_object_t object, vm_pindex_t pindex, int *valid)); static void swap_pager_finish __P((swp_clean_t spc)); static void swap_pager_freepage __P((vm_page_t m)); static void swap_pager_free_swap __P((vm_object_t object)); static void swap_pager_freeswapspace __P((vm_object_t object, unsigned int from, unsigned int to)); static int swap_pager_getswapspace __P((vm_object_t object, unsigned int amount, daddr_t *rtval)); static void swap_pager_iodone __P((struct buf *)); static void swap_pager_iodone1 __P((struct buf *bp)); static void swap_pager_reclaim __P((void)); static void swap_pager_ridpages __P((vm_page_t *m, int count, int reqpage)); static void swap_pager_setvalid __P((vm_object_t object, vm_offset_t offset, int valid)); static void swapsizecheck __P((void)); #define SWAPLOW (vm_swap_size < (512 * btodb(PAGE_SIZE))) static inline void swapsizecheck() { if (vm_swap_size < 128 * btodb(PAGE_SIZE)) { if (swap_pager_full == 0) printf("swap_pager: out of swap space\n"); swap_pager_full = 1; } else if (vm_swap_size > 192 * btodb(PAGE_SIZE)) swap_pager_full = 0; } static void swap_pager_init() { 
TAILQ_INIT(&swap_pager_object_list); TAILQ_INIT(&swap_pager_un_object_list); /* * Initialize clean lists */ TAILQ_INIT(&swap_pager_inuse); TAILQ_INIT(&swap_pager_done); TAILQ_INIT(&swap_pager_free); swap_pager_free_count = 0; /* * Calculate the swap allocation constants. */ dmmin = PAGE_SIZE / DEV_BSIZE; dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2; } void swap_pager_swap_init() { swp_clean_t spc; struct buf *bp; int i; /* * kva's are allocated here so that we dont need to keep doing * kmem_alloc pageables at runtime */ for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) { spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * MAX_PAGEOUT_CLUSTER); if (!spc->spc_kva) { break; } spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL); if (!spc->spc_bp) { kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); break; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); swap_pager_free_count++; } } int swap_pager_swp_alloc(object, wait) vm_object_t object; int wait; { sw_blk_t swb; int nblocks; int i, j; nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES; swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait); if (swb == NULL) return 1; for (i = 0; i < nblocks; i++) { swb[i].swb_valid = 0; swb[i].swb_locked = 0; for (j = 0; j < SWB_NPAGES; j++) swb[i].swb_block[j] = SWB_EMPTY; } object->un_pager.swp.swp_nblocks = nblocks; object->un_pager.swp.swp_allocsize = 0; object->un_pager.swp.swp_blocks = swb; object->un_pager.swp.swp_poip = 0; if (object->handle != NULL) { TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list); } else { TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); } return 0; } /* * Allocate an object and associated resources. * Note that if we are called from the pageout daemon (handle == NULL) * we should not wait for memory as it could resulting in deadlock. 
 */
static vm_object_t
swap_pager_alloc(handle, size, prot, offset)
	void *handle;
	register vm_size_t size;
	vm_prot_t prot;
	vm_ooffset_t offset;
{
	vm_object_t object;

	/*
	 * If this is a "named" anonymous region, look it up and use the
	 * object if it exists, otherwise allocate a new one.
	 */
	if (handle) {
		object = vm_pager_object_lookup(&swap_pager_object_list, handle);
		if (object != NULL) {
			vm_object_reference(object);
		} else {
			/*
			 * XXX - there is a race condition here.  Two processes
			 * can request the same named object simultaneously,
			 * and if one blocks for memory, the result is a disaster.
			 * Probably quite rare, but is yet another reason to just
			 * rip support of "named anonymous regions" out altogether.
			 */
			object = vm_object_allocate(OBJT_SWAP,
				OFF_TO_IDX(offset + PAGE_MASK) + size);
			object->handle = handle;
			(void) swap_pager_swp_alloc(object, M_WAITOK);
		}
	} else {
		object = vm_object_allocate(OBJT_SWAP,
			OFF_TO_IDX(offset + PAGE_MASK) + size);
		(void) swap_pager_swp_alloc(object, M_WAITOK);
	}
	return (object);
}

/*
 * returns disk block associated with pager and offset
 * additionally, as a side effect returns a flag indicating
 * if the block has been written
 */
inline static daddr_t *
swap_pager_diskaddr(object, pindex, valid)
	vm_object_t object;
	vm_pindex_t pindex;
	int *valid;
{
	register sw_blk_t swb;
	int ix;

	if (valid)
		*valid = 0;
	/* Which sw_blk_t covers this page index. */
	ix = pindex / SWB_NPAGES;
	if ((ix >= object->un_pager.swp.swp_nblocks) ||
	    (pindex >= object->size)) {
		return (FALSE);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;
	if (valid)
		*valid = swb->swb_valid & (1 << ix);
	return &swb->swb_block[ix];
}

/*
 * Utility routine to set the valid (written) bit for
 * a block associated with a pager and offset
 */
static void
swap_pager_setvalid(object, offset, valid)
	vm_object_t object;
	vm_offset_t offset;
	int valid;
{
	register sw_blk_t swb;
	int ix;

	ix = offset / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks)
		return;
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = offset %
SWB_NPAGES; if (valid) swb->swb_valid |= (1 << ix); else swb->swb_valid &= ~(1 << ix); return; } /* * this routine allocates swap space with a fragmentation * minimization policy. */ static int swap_pager_getswapspace(object, amount, rtval) vm_object_t object; unsigned int amount; daddr_t *rtval; { unsigned location; vm_swap_size -= amount; if (!rlist_alloc(&swaplist, amount, &location)) { vm_swap_size += amount; return 0; } else { swapsizecheck(); object->un_pager.swp.swp_allocsize += amount; *rtval = location; return 1; } } /* * this routine frees swap space with a fragmentation * minimization policy. */ static void swap_pager_freeswapspace(object, from, to) vm_object_t object; unsigned int from; unsigned int to; { rlist_free(&swaplist, from, to); vm_swap_size += (to - from) + 1; object->un_pager.swp.swp_allocsize -= (to - from) + 1; swapsizecheck(); } /* * this routine frees swap blocks from a specified pager */ void swap_pager_freespace(object, start, size) vm_object_t object; vm_pindex_t start; vm_size_t size; { vm_pindex_t i; int s; s = splbio(); for (i = start; i < start + size; i += 1) { int valid; daddr_t *addr = swap_pager_diskaddr(object, i, &valid); if (addr && *addr != SWB_EMPTY) { swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1); if (valid) { swap_pager_setvalid(object, i, 0); } *addr = SWB_EMPTY; } } splx(s); } /* * same as freespace, but don't free, just force a DMZ next time */ void swap_pager_dmzspace(object, start, size) vm_object_t object; vm_pindex_t start; vm_size_t size; { vm_pindex_t i; int s; s = splbio(); for (i = start; i < start + size; i += 1) { int valid; daddr_t *addr = swap_pager_diskaddr(object, i, &valid); if (addr && *addr != SWB_EMPTY) { if (valid) { swap_pager_setvalid(object, i, 0); } } } splx(s); } static void swap_pager_free_swap(object) vm_object_t object; { register int i, j; register sw_blk_t swb; int first_block=0, block_count=0; int s; /* * Free left over swap blocks */ s = splbio(); for (i = 0, swb 
= object->un_pager.swp.swp_blocks; i < object->un_pager.swp.swp_nblocks; i++, swb++) { for (j = 0; j < SWB_NPAGES; j++) { if (swb->swb_block[j] != SWB_EMPTY) { /* * initially the length of the run is zero */ if (block_count == 0) { first_block = swb->swb_block[j]; block_count = btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; /* * if the new block can be included into the current run */ } else if (swb->swb_block[j] == first_block + block_count) { block_count += btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; /* * terminate the previous run, and start a new one */ } else { swap_pager_freeswapspace(object, first_block, (unsigned) first_block + block_count - 1); first_block = swb->swb_block[j]; block_count = btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; } } } } if (block_count) { swap_pager_freeswapspace(object, first_block, (unsigned) first_block + block_count - 1); } splx(s); } /* * swap_pager_reclaim frees up over-allocated space from all pagers * this eliminates internal fragmentation due to allocation of space * for segments that are never swapped to. It has been written so that * it does not block until the rlist_free operation occurs; it keeps * the queues consistant. 
*/ /* * Maximum number of blocks (pages) to reclaim per pass */ #define MAXRECLAIM 128 static void swap_pager_reclaim() { vm_object_t object; int i, j, k; int s; int reclaimcount; static struct { int address; vm_object_t object; } reclaims[MAXRECLAIM]; static int in_reclaim; /* * allow only one process to be in the swap_pager_reclaim subroutine */ s = splbio(); if (in_reclaim) { tsleep(&in_reclaim, PSWP, "swrclm", 0); splx(s); return; } in_reclaim = 1; reclaimcount = 0; /* for each pager queue */ for (k = 0; swp_qs[k]; k++) { object = TAILQ_FIRST(swp_qs[k]); while (object && (reclaimcount < MAXRECLAIM)) { /* * see if any blocks associated with a pager has been * allocated but not used (written) */ if (object->paging_in_progress == 0) { for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) { sw_blk_t swb = &object->un_pager.swp.swp_blocks[i]; if (swb->swb_locked) continue; for (j = 0; j < SWB_NPAGES; j++) { if (swb->swb_block[j] != SWB_EMPTY && (swb->swb_valid & (1 << j)) == 0) { reclaims[reclaimcount].address = swb->swb_block[j]; reclaims[reclaimcount++].object = object; swb->swb_block[j] = SWB_EMPTY; if (reclaimcount >= MAXRECLAIM) goto rfinished; } } } } object = TAILQ_NEXT(object, pager_object_list); } } rfinished: /* * free the blocks that have been added to the reclaim list */ for (i = 0; i < reclaimcount; i++) { swap_pager_freeswapspace(reclaims[i].object, reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1); } splx(s); in_reclaim = 0; wakeup(&in_reclaim); } /* * swap_pager_copy copies blocks from one pager to another and * destroys the source pager */ void swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset, offset) vm_object_t srcobject; vm_pindex_t srcoffset; vm_object_t dstobject; vm_pindex_t dstoffset; vm_pindex_t offset; { vm_pindex_t i; int origsize; int s; if (vm_swap_size) no_swap_space = 0; origsize = srcobject->un_pager.swp.swp_allocsize; /* * remove the source object from the swap_pager internal queue */ if (srcobject->handle 
== NULL) { TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list); } else { TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list); } s = splbio(); while (srcobject->un_pager.swp.swp_poip) { tsleep(srcobject, PVM, "spgout", 0); } splx(s); /* * clean all of the pages that are currently active and finished */ swap_pager_sync(); s = splbio(); /* * transfer source to destination */ for (i = 0; i < dstobject->size; i += 1) { int srcvalid, dstvalid; daddr_t *srcaddrp = swap_pager_diskaddr(srcobject, i + offset + srcoffset, &srcvalid); daddr_t *dstaddrp; /* * see if the source has space allocated */ if (srcaddrp && *srcaddrp != SWB_EMPTY) { /* * if the source is valid and the dest has no space, * then copy the allocation from the srouce to the * dest. */ if (srcvalid) { dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset, &dstvalid); /* * if the dest already has a valid block, * deallocate the source block without * copying. */ if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { swap_pager_freeswapspace(dstobject, *dstaddrp, *dstaddrp + btodb(PAGE_SIZE) - 1); *dstaddrp = SWB_EMPTY; } if (dstaddrp && *dstaddrp == SWB_EMPTY) { *dstaddrp = *srcaddrp; *srcaddrp = SWB_EMPTY; dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE); srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE); swap_pager_setvalid(dstobject, i + dstoffset, 1); } } /* * if the source is not empty at this point, then * deallocate the space. 
*/ if (*srcaddrp != SWB_EMPTY) { swap_pager_freeswapspace(srcobject, *srcaddrp, *srcaddrp + btodb(PAGE_SIZE) - 1); *srcaddrp = SWB_EMPTY; } } } splx(s); /* * Free left over swap blocks */ swap_pager_free_swap(srcobject); if (srcobject->un_pager.swp.swp_allocsize) { printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n", srcobject->un_pager.swp.swp_allocsize, origsize); } free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA); srcobject->un_pager.swp.swp_blocks = NULL; return; } static void swap_pager_dealloc(object) vm_object_t object; { int s; /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle == NULL) { TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list); } else { TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list); } /* * Wait for all pageouts to finish and remove all entries from * cleaning list. */ s = splbio(); while (object->un_pager.swp.swp_poip) { tsleep(object, PVM, "swpout", 0); } splx(s); swap_pager_sync(); /* * Free left over swap blocks */ swap_pager_free_swap(object); if (object->un_pager.swp.swp_allocsize) { printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n", object->un_pager.swp.swp_allocsize); } /* * Free swap management resources */ free(object->un_pager.swp.swp_blocks, M_VMPGDATA); object->un_pager.swp.swp_blocks = NULL; } static inline __pure int swap_pager_block_index(pindex) vm_pindex_t pindex; { return (pindex / SWB_NPAGES); } static inline __pure int swap_pager_block_offset(pindex) vm_pindex_t pindex; { return (pindex % SWB_NPAGES); } /* * swap_pager_haspage returns TRUE if the pager has data that has * been written out. 
 */
static boolean_t
swap_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	register sw_blk_t swb;
	int ix;

	if (before != NULL)
		*before = 0;
	if (after != NULL)
		*after = 0;
	/* Which sw_blk_t covers this page index. */
	ix = pindex / SWB_NPAGES;
	if (ix >= object->un_pager.swp.swp_nblocks) {
		return (FALSE);
	}
	swb = &object->un_pager.swp.swp_blocks[ix];
	ix = pindex % SWB_NPAGES;
	if (swb->swb_block[ix] != SWB_EMPTY) {
		if (swb->swb_valid & (1 << ix)) {
			int tix;
			if (before) {
				/*
				 * Count how many immediately preceding pages
				 * are valid and disk-contiguous with this one.
				 */
				for(tix = ix - 1; tix >= 0; --tix) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] +
					    (ix - tix) * (PAGE_SIZE/DEV_BSIZE)) !=
					    swb->swb_block[ix])
						break;
					(*before)++;
				}
			}
			if (after) {
				/*
				 * Count how many immediately following pages
				 * are valid and disk-contiguous with this one.
				 */
				for(tix = ix + 1; tix < SWB_NPAGES; tix++) {
					if ((swb->swb_valid & (1 << tix)) == 0)
						break;
					if ((swb->swb_block[tix] -
					    (tix - ix) * (PAGE_SIZE/DEV_BSIZE)) !=
					    swb->swb_block[ix])
						break;
					(*after)++;
				}
			}
			return TRUE;
		}
	}
	return (FALSE);
}

/*
 * swap_pager_freepage is a convenience routine that clears the busy
 * bit and deallocates a page.
 */
static void
swap_pager_freepage(m)
	vm_page_t m;
{
	PAGE_WAKEUP(m);
	vm_page_free(m);
}

/*
 * swap_pager_ridpages is a convenience routine that deallocates all
 * but the required page.  this is usually used in error returns that
 * need to invalidate the "extra" readahead pages.
*/ static void swap_pager_ridpages(m, count, reqpage) vm_page_t *m; int count; int reqpage; { int i; for (i = 0; i < count; i++) if (i != reqpage) swap_pager_freepage(m[i]); } /* * swap_pager_iodone1 is the completion routine for both reads and async writes */ static void swap_pager_iodone1(bp) struct buf *bp; { bp->b_flags |= B_DONE; bp->b_flags &= ~B_ASYNC; wakeup(bp); } static int swap_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count, reqpage; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i; boolean_t rv; vm_offset_t kva, off[count]; swp_clean_t spc; vm_pindex_t paging_offset; int reqaddr[count]; int sequential; int first, last; int failed; int reqdskregion; object = m[reqpage]->object; paging_offset = OFF_TO_IDX(object->paging_offset); sequential = (m[reqpage]->pindex == (object->last_read + 1)); for (i = 0; i < count; i++) { vm_pindex_t fidx = m[i]->pindex + paging_offset; int ix = swap_pager_block_index(fidx); if (ix >= object->un_pager.swp.swp_nblocks) { int j; if (i <= reqpage) { swap_pager_ridpages(m, count, reqpage); return (VM_PAGER_FAIL); } for (j = i; j < count; j++) { swap_pager_freepage(m[j]); } count = i; break; } swb[i] = &object->un_pager.swp.swp_blocks[ix]; off[i] = swap_pager_block_offset(fidx); reqaddr[i] = swb[i]->swb_block[off[i]]; } /* make sure that our required input request is existant */ if (reqaddr[reqpage] == SWB_EMPTY || (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { swap_pager_ridpages(m, count, reqpage); return (VM_PAGER_FAIL); } reqdskregion = reqaddr[reqpage] / dmmax; /* * search backwards for the first contiguous page to transfer */ failed = 0; first = 0; for (i = reqpage - 1; i >= 0; --i) { if (sequential || failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (first == 0) first 
= i + 1; } } /* * search forwards for the last contiguous page to transfer */ failed = 0; last = count; for (i = reqpage + 1; i < count; i++) { if (failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (last == count) last = i; } } count = last; if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; reqaddr[i - first] = reqaddr[i]; off[i - first] = off[i]; } count -= first; reqpage -= first; } ++swb[reqpage]->swb_locked; /* * at this point: "m" is a pointer to the array of vm_page_t for * paging I/O "count" is the number of vm_page_t entries represented * by "m" "object" is the vm_object_t for I/O "reqpage" is the index * into "m" for the page actually faulted */ spc = NULL; if ((count == 1) && ((spc = TAILQ_FIRST(&swap_pager_free)) != NULL)) { TAILQ_REMOVE(&swap_pager_free, spc, spc_list); swap_pager_free_count--; kva = spc->spc_kva; bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; bp->b_vnbufs.le_next = NOLIST; } else { /* * Get a swap buffer header to perform the IO */ bp = getpbuf(); kva = (vm_offset_t) bp->b_data; } /* * map our page(s) into kva for input */ pmap_qenter(kva, m, count); bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING; bp->b_iodone = swap_pager_iodone1; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; crhold(bp->b_rcred); crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = reqaddr[0]; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; pbgetvp(swapdev_vp, bp); cnt.v_swapin++; cnt.v_swappgsin += count; /* * perform the I/O */ VOP_STRATEGY(bp); /* * wait for the sync I/O to complete */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { if (tsleep(bp, PVM, "swread", hz*20)) { printf("swap_pager: indefinite wait buffer: device: %d, blkno: 
%d, size: %d\n", bp->b_dev, bp->b_blkno, bp->b_bcount); } } if (bp->b_flags & B_ERROR) { printf("swap_pager: I/O error - pagein failed; blkno %d, size %d, error %d\n", bp->b_blkno, bp->b_bcount, bp->b_error); rv = VM_PAGER_ERROR; } else { rv = VM_PAGER_OK; } /* * relpbuf does this, but we maintain our own buffer list also... */ if (bp->b_vp) pbrelvp(bp); splx(s); swb[reqpage]->swb_locked--; /* * remove the mapping for kernel virtual */ pmap_qremove(kva, count); if (spc) { m[reqpage]->object->last_read = m[reqpage]->pindex; if (bp->b_flags & B_WANTED) wakeup(bp); /* * if we have used an spc, we need to free it. */ if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); swap_pager_free_count++; if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); if (rv == VM_PAGER_OK) { - pmap_tc_modified(m[reqpage]); + pmap_clear_modify(VM_PAGE_TO_PHYS(m[reqpage])); m[reqpage]->valid = VM_PAGE_BITS_ALL; m[reqpage]->dirty = 0; } } else { /* * release the physical I/O buffer */ relpbuf(bp); /* * finish up input if everything is ok */ if (rv == VM_PAGER_OK) { for (i = 0; i < count; i++) { - pmap_tc_modified(m[i]); + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; m[i]->flags &= ~PG_ZERO; if (i != reqpage) { /* * whether or not to leave the page * activated is up in the air, but we * should put the page on a page queue * somewhere. (it already is in the * object). After some emperical * results, it is best to deactivate * the readahead pages. 
*/ vm_page_deactivate(m[i]); /* * just in case someone was asking for * this page we now tell them that it * is ok to use */ m[i]->valid = VM_PAGE_BITS_ALL; PAGE_WAKEUP(m[i]); } } m[reqpage]->object->last_read = m[count-1]->pindex; /* * If we're out of swap space, then attempt to free * some whenever multiple pages are brought in. We * must set the dirty bits so that the page contents * will be preserved. */ if (SWAPLOW) { for (i = 0; i < count; i++) { m[i]->dirty = VM_PAGE_BITS_ALL; } swap_pager_freespace(object, m[0]->pindex + paging_offset, count); } } else { swap_pager_ridpages(m, count, reqpage); } } return (rv); } int swap_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i, j, ix; boolean_t rv; vm_offset_t kva, off, fidx; swp_clean_t spc; vm_pindex_t paging_pindex; int reqaddr[count]; int failed; if (vm_swap_size) no_swap_space = 0; if (no_swap_space) { for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_FAIL; return VM_PAGER_FAIL; } spc = NULL; object = m[0]->object; paging_pindex = OFF_TO_IDX(object->paging_offset); failed = 0; for (j = 0; j < count; j++) { fidx = m[j]->pindex + paging_pindex; ix = swap_pager_block_index(fidx); swb[j] = 0; if (ix >= object->un_pager.swp.swp_nblocks) { rtvals[j] = VM_PAGER_FAIL; failed = 1; continue; } else { rtvals[j] = VM_PAGER_OK; } swb[j] = &object->un_pager.swp.swp_blocks[ix]; swb[j]->swb_locked++; if (failed) { rtvals[j] = VM_PAGER_FAIL; continue; } off = swap_pager_block_offset(fidx); reqaddr[j] = swb[j]->swb_block[off]; if (reqaddr[j] == SWB_EMPTY) { daddr_t blk; int tries; int ntoget; tries = 0; s = splbio(); /* * if any other pages have been allocated in this * block, we only try to get one page. */ for (i = 0; i < SWB_NPAGES; i++) { if (swb[j]->swb_block[i] != SWB_EMPTY) break; } ntoget = (i == SWB_NPAGES) ? 
SWB_NPAGES : 1; /* * this code is alittle conservative, but works (the * intent of this code is to allocate small chunks for * small objects) */ if ((off == 0) && ((fidx + ntoget) > object->size)) { ntoget = object->size - fidx; } retrygetspace: if (!swap_pager_full && ntoget > 1 && swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE), &blk)) { for (i = 0; i < ntoget; i++) { swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; swb[j]->swb_valid = 0; } reqaddr[j] = swb[j]->swb_block[off]; } else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE), &swb[j]->swb_block[off])) { /* * if the allocation has failed, we try to * reclaim space and retry. */ if (++tries == 1) { swap_pager_reclaim(); goto retrygetspace; } rtvals[j] = VM_PAGER_AGAIN; failed = 1; swap_pager_full = 1; } else { reqaddr[j] = swb[j]->swb_block[off]; swb[j]->swb_valid &= ~(1 << off); } splx(s); } } /* * search forwards for the last contiguous page to transfer */ failed = 0; for (i = 0; i < count; i++) { if (failed || (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) || ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) || (rtvals[i] != VM_PAGER_OK)) { failed = 1; if (rtvals[i] == VM_PAGER_OK) rtvals[i] = VM_PAGER_AGAIN; } } for (i = 0; i < count; i++) { if (rtvals[i] != VM_PAGER_OK) { if (swb[i]) --swb[i]->swb_locked; } } for (i = 0; i < count; i++) if (rtvals[i] != VM_PAGER_OK) break; if (i == 0) { return VM_PAGER_AGAIN; } count = i; for (i = 0; i < count; i++) { if (reqaddr[i] == SWB_EMPTY) { printf("I/O to empty block???? -- pindex: %d, i: %d\n", m[i]->pindex, i); } } /* * For synchronous writes, we clean up all completed async pageouts. 
*/ if (sync == TRUE) { swap_pager_sync(); } kva = 0; /* * get a swap pager clean data structure, block until we get it */ if (swap_pager_free_count <= 3) { s = splbio(); if (curproc == pageproc) { retryfree: /* * pageout daemon needs a swap control block */ swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT|SWAP_FREE_NEEDED; /* * if it does not get one within a short time, then * there is a potential deadlock, so we go-on trying * to free pages. It is important to block here as opposed * to returning, thereby allowing the pageout daemon to continue. * It is likely that pageout daemon will start suboptimally * reclaiming vnode backed pages if we don't block. Since the * I/O subsystem is probably already fully utilized, might as * well wait. */ if (tsleep(&swap_pager_free, PVM, "swpfre", hz/5)) { swap_pager_sync(); if (swap_pager_free_count <= 3) { splx(s); return VM_PAGER_AGAIN; } } else { /* * we make sure that pageouts aren't taking up all of * the free swap control blocks. */ swap_pager_sync(); if (swap_pager_free_count <= 3) { goto retryfree; } } } else { pagedaemon_wakeup(); while (swap_pager_free_count <= 3) { swap_pager_needflags |= SWAP_FREE_NEEDED; tsleep(&swap_pager_free, PVM, "swpfre", 0); pagedaemon_wakeup(); } } splx(s); } spc = TAILQ_FIRST(&swap_pager_free); if (spc == NULL) panic("swap_pager_putpages: free queue is empty, %d expected\n", swap_pager_free_count); TAILQ_REMOVE(&swap_pager_free, spc, spc_list); swap_pager_free_count--; kva = spc->spc_kva; /* * map our page(s) into kva for I/O */ pmap_qenter(kva, m, count); /* * get the base I/O offset into the swap file */ for (i = 0; i < count; i++) { fidx = m[i]->pindex + paging_pindex; off = swap_pager_block_offset(fidx); /* * set the valid bit */ swb[i]->swb_valid |= (1 << off); /* * and unlock the data structure */ swb[i]->swb_locked--; } /* * Get a swap buffer header and perform the IO */ bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; bp->b_vnbufs.le_next = NOLIST; bp->b_flags = B_BUSY | 
B_PAGING; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_data = (caddr_t) kva; bp->b_blkno = reqaddr[0]; pbgetvp(swapdev_vp, bp); bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; swapdev_vp->v_numoutput++; /* * If this is an async write we set up additional buffer fields and * place a "cleaning" entry on the inuse queue. */ s = splbio(); if (sync == FALSE) { spc->spc_flags = 0; spc->spc_object = object; for (i = 0; i < count; i++) spc->spc_m[i] = m[i]; spc->spc_count = count; /* * the completion routine for async writes */ bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; object->un_pager.swp.swp_poip++; TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); } else { object->un_pager.swp.swp_poip++; bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone1; } cnt.v_swapout++; cnt.v_swappgsout += count; /* * perform the I/O */ VOP_STRATEGY(bp); if (sync == FALSE) { if ((bp->b_flags & B_DONE) == B_DONE) { swap_pager_sync(); } splx(s); for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_PEND; } return VM_PAGER_PEND; } /* * wait for the sync I/O to complete */ while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "swwrt", 0); } if (bp->b_flags & B_ERROR) { printf("swap_pager: I/O error - pageout failed; blkno %d, size %d, error %d\n", bp->b_blkno, bp->b_bcount, bp->b_error); rv = VM_PAGER_ERROR; } else { rv = VM_PAGER_OK; } object->un_pager.swp.swp_poip--; if (object->un_pager.swp.swp_poip == 0) wakeup(object); if (bp->b_vp) pbrelvp(bp); if (bp->b_flags & B_WANTED) wakeup(bp); splx(s); /* * remove the mapping for kernel virtual */ pmap_qremove(kva, count); /* * if we have written the page, then indicate that the page is clean. 
*/ if (rv == VM_PAGER_OK) { for (i = 0; i < count; i++) { if (rtvals[i] == VM_PAGER_OK) { - pmap_tc_modified(m[i]); + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; /* * optimization, if a page has been read * during the pageout process, we activate it. */ if ((m[i]->queue != PQ_ACTIVE) && ((m[i]->flags & (PG_WANTED|PG_REFERENCED)) || - pmap_tc_referenced(VM_PAGE_TO_PHYS(m[i])))) { + pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))) { vm_page_activate(m[i]); } } } } else { for (i = 0; i < count; i++) { rtvals[i] = rv; } } if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); swap_pager_free_count++; if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); return (rv); } static void swap_pager_sync() { register swp_clean_t spc, tspc; register int s; tspc = NULL; if (TAILQ_FIRST(&swap_pager_done) == NULL) return; for (;;) { s = splbio(); /* * Look up and removal from done list must be done at splbio() * to avoid conflicts with swap_pager_iodone. */ while ((spc = TAILQ_FIRST(&swap_pager_done)) != 0) { pmap_qremove(spc->spc_kva, spc->spc_count); swap_pager_finish(spc); TAILQ_REMOVE(&swap_pager_done, spc, spc_list); goto doclean; } /* * No operations done, thats all we can do for now. */ splx(s); break; /* * The desired page was found to be busy earlier in the scan * but has since completed. 
*/ doclean: if (tspc && tspc == spc) { tspc = NULL; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); swap_pager_free_count++; if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); splx(s); } return; } void swap_pager_finish(spc) register swp_clean_t spc; { vm_object_t object = spc->spc_m[0]->object; int i; object->paging_in_progress -= spc->spc_count; if ((object->paging_in_progress == 0) && (object->flags & OBJ_PIPWNT)) { object->flags &= ~OBJ_PIPWNT; wakeup(object); } /* * If no error, mark as clean and inform the pmap system. If error, * mark as dirty so we will try again. (XXX could get stuck doing * this, should give up after awhile) */ if (spc->spc_flags & SPC_ERROR) { for (i = 0; i < spc->spc_count; i++) { printf("swap_pager_finish: I/O error, clean of page %lx failed\n", (u_long) VM_PAGE_TO_PHYS(spc->spc_m[i])); } } else { - int pagewanted = 0; for (i = 0; i < spc->spc_count; i++) { - if (spc->spc_m[i]->flags & (PG_WANTED | PG_REFERENCED)) { - pagewanted = 1; - break; - } - } - for (i = 0; i < spc->spc_count; i++) { - pmap_tc_modified(spc->spc_m[i]); + pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i])); spc->spc_m[i]->dirty = 0; - if (pagewanted) { - if (spc->spc_m[i]->queue != PQ_ACTIVE) - vm_page_activate(spc->spc_m[i]); - spc->spc_m[i]->flags |= PG_REFERENCED; - } + if ((spc->spc_m[i]->queue != PQ_ACTIVE) && + ((spc->spc_m[i]->flags & PG_WANTED) || pmap_is_referenced(VM_PAGE_TO_PHYS(spc->spc_m[i])))) + vm_page_activate(spc->spc_m[i]); } } for (i = 0; i < spc->spc_count; i++) { /* * we wakeup any processes that are waiting on these pages. 
*/ PAGE_WAKEUP(spc->spc_m[i]); } nswiodone -= spc->spc_count; return; } /* * swap_pager_iodone */ static void swap_pager_iodone(bp) register struct buf *bp; { register swp_clean_t spc; int s; s = splbio(); spc = (swp_clean_t) bp->b_spc; TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); if (bp->b_flags & B_ERROR) { spc->spc_flags |= SPC_ERROR; printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n", (bp->b_flags & B_READ) ? "pagein" : "pageout", (u_long) bp->b_blkno, bp->b_bcount, bp->b_error); } if (bp->b_vp) pbrelvp(bp); +/* if (bp->b_flags & B_WANTED) +*/ wakeup(bp); if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); nswiodone += spc->spc_count; if (--spc->spc_object->un_pager.swp.swp_poip == 0) { wakeup(spc->spc_object); } if ((swap_pager_needflags & SWAP_FREE_NEEDED) || TAILQ_FIRST(&swap_pager_inuse) == 0) { swap_pager_needflags &= ~SWAP_FREE_NEEDED; wakeup(&swap_pager_free); } if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) { swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT; pagedaemon_wakeup(); } if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } if ((TAILQ_FIRST(&swap_pager_inuse) == NULL) || ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min && nswiodone + cnt.v_free_count + cnt.v_cache_count >= cnt.v_free_min)) { pagedaemon_wakeup(); } splx(s); } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 17333) +++ head/sys/vm/vm_fault.c (revision 17334) @@ -1,1139 +1,1146 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. 
* * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_fault.c,v 1.54 1996/07/27 03:23:52 dyson Exp $ + * $Id: vm_fault.c,v 1.55 1996/07/28 01:14:01 dyson Exp $ */ /* * Page fault handling module. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int vm_fault_additional_pages __P((vm_page_t, int, int, vm_page_t *, int *)); #define VM_FAULT_READ_AHEAD 4 #define VM_FAULT_READ_BEHIND 3 #define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) +int vm_fault_free_1; +int vm_fault_copy_save_1; +int vm_fault_copy_save_2; + /* * vm_fault: * * Handle a page fault occuring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. 
* * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(map, vaddr, fault_type, change_wiring) vm_map_t map; vm_offset_t vaddr; vm_prot_t fault_type; boolean_t change_wiring; { vm_object_t first_object; vm_pindex_t first_pindex; vm_map_entry_t entry; register vm_object_t object; register vm_pindex_t pindex; vm_page_t m; vm_page_t first_m; vm_prot_t prot; int result; boolean_t wired; boolean_t su; boolean_t lookup_still_valid; vm_page_t old_m; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ]; int hardfault = 0; struct vnode *vp = NULL; cnt.v_vm_faults++; /* needs lock XXX */ /* * Recovery actions */ #define FREE_PAGE(m) { \ PAGE_WAKEUP(m); \ vm_page_free(m); \ } #define RELEASE_PAGE(m) { \ PAGE_WAKEUP(m); \ if (m->queue != PQ_ACTIVE) vm_page_activate(m); \ } #define UNLOCK_MAP { \ if (lookup_still_valid) { \ vm_map_lookup_done(map, entry); \ lookup_still_valid = FALSE; \ } \ } #define UNLOCK_THINGS { \ vm_object_pip_wakeup(object); \ if (object != first_object) { \ FREE_PAGE(first_m); \ vm_object_pip_wakeup(first_object); \ } \ UNLOCK_MAP; \ if (vp != NULL) VOP_UNLOCK(vp); \ } #define UNLOCK_AND_DEALLOCATE { \ UNLOCK_THINGS; \ vm_object_deallocate(first_object); \ } RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry, &first_object, &first_pindex, &prot, &wired, &su)) != KERN_SUCCESS) { return (result); } vp = vnode_pager_lock(first_object); lookup_still_valid = TRUE; if (wired) fault_type = prot; first_m = NULL; /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. 
*/ first_object->ref_count++; first_object->paging_in_progress++; /* * INVARIANTS (through entire routine): * * 1) At all times, we must either have the object lock or a busy * page in some object to prevent some other process from trying to * bring in the same page. * * Note that we cannot hold any locks during the pager access or when * waiting for memory, so we use a busy page then. * * Note also that we aren't as concerned about more than one thead * attempting to pager_data_unlock the same page at once, so we don't * hold the page as busy then, but do record the highest unlock value * so far. [Unlock requests may also be delivered out of order.] * * 2) Once we have a busy page, we must remove it from the pageout * queues, so that the pageout daemon will not grab it away. * * 3) To prevent another process from racing us down the shadow chain * and entering a new page in the top object before we do, we must * keep a busy page in the top object while following the shadow * chain. * * 4) We must increment paging_in_progress on any object for which * we have a busy page, to prevent vm_object_collapse from removing * the busy page without our noticing. */ /* * Search for the page at object/offset. */ object = first_object; pindex = first_pindex; /* * See whether this page is resident */ while (TRUE) { m = vm_page_lookup(object, pindex); if (m != NULL) { int queue; /* * If the page is being brought in, wait for it and * then retry. */ if ((m->flags & PG_BUSY) || m->busy) { int s; UNLOCK_THINGS; s = splvm(); if (((m->flags & PG_BUSY) || m->busy)) { m->flags |= PG_WANTED | PG_REFERENCED; cnt.v_intrans++; tsleep(m, PSWP, "vmpfw", 0); } splx(s); vm_object_deallocate(first_object); goto RetryFault; } queue = m->queue; - vm_page_unqueue(m,0); + vm_page_unqueue_nowakeup(m); /* * Mark page busy for other processes, and the pagedaemon. 
*/ if ((queue == PQ_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { vm_page_activate(m); UNLOCK_AND_DEALLOCATE; VM_WAIT; goto RetryFault; } m->flags |= PG_BUSY; if (m->valid && ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && m->object != kernel_object && m->object != kmem_object) { goto readrest; } break; } if (((object->type != OBJT_DEFAULT) && (!change_wiring || wired)) || (object == first_object)) { if (pindex >= object->size) { UNLOCK_AND_DEALLOCATE; return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. */ m = vm_page_alloc(object, pindex, (vp || object->backing_object)?VM_ALLOC_NORMAL:VM_ALLOC_ZERO); if (m == NULL) { UNLOCK_AND_DEALLOCATE; VM_WAIT; goto RetryFault; } } readrest: if (object->type != OBJT_DEFAULT && (!change_wiring || wired)) { int rv; int faultcount; int reqpage; int ahead, behind; ahead = VM_FAULT_READ_AHEAD; behind = VM_FAULT_READ_BEHIND; if (first_object->behavior == OBJ_RANDOM) { ahead = 0; behind = 0; } if ((first_object->type != OBJT_DEVICE) && (first_object->behavior == OBJ_SEQUENTIAL)) { vm_pindex_t firstpindex, tmppindex; if (first_pindex < 2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1)) firstpindex = 0; else firstpindex = first_pindex - 2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1); for(tmppindex = first_pindex - 1; tmppindex >= first_pindex; --tmppindex) { vm_page_t mt; mt = vm_page_lookup( first_object, tmppindex); if (mt == NULL || (mt->valid != VM_PAGE_BITS_ALL)) break; if (mt->busy || (mt->flags & (PG_BUSY|PG_FICTITIOUS)) || mt->hold_count || mt->wire_count) continue; if (mt->dirty == 0) vm_page_test_dirty(mt); if (mt->dirty) { vm_page_protect(mt, VM_PROT_NONE); vm_page_deactivate(mt); } else { vm_page_cache(mt); } } ahead += behind; behind = 0; } /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. 
If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. */ faultcount = vm_fault_additional_pages( m, behind, ahead, marray, &reqpage); /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. */ UNLOCK_MAP; rv = faultcount ? vm_pager_get_pages(object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ m = vm_page_lookup(object, pindex); if( !m) { UNLOCK_AND_DEALLOCATE; goto RetryFault; } hardfault++; break; } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager input (probably hardware) error, PID %d failure\n", curproc->p_pid); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { FREE_PAGE(m); UNLOCK_AND_DEALLOCATE; return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (object != first_object) { FREE_PAGE(m); /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! 
*/ } } /* * We get here if the object has default pager (or unwiring) or the * pager doesn't have the page. */ if (object == first_object) first_m = m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ pindex += OFF_TO_IDX(object->backing_object_offset); next_object = object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (object != first_object) { vm_object_pip_wakeup(object); object = first_object; pindex = first_pindex; m = first_m; } first_m = NULL; if ((m->flags & PG_ZERO) == 0) vm_page_zero_fill(m); cnt.v_zfod++; break; } else { if (object != first_object) { vm_object_pip_wakeup(object); } object = next_object; object->paging_in_progress++; } } if ((m->flags & PG_BUSY) == 0) panic("vm_fault: not busy after main loop"); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ old_m = m; /* save page that would be copied */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (object != first_object) { /* * We only really need to copy if we want to write it. */ if (fault_type & VM_PROT_WRITE) { /* * This allows pages to be virtually copied from a backing_object * into the first_object, where the backing object has no other * refs to it, and cannot gain any more refs. Instead of a * bcopy, we just move the page from the backing object to the * first object. Note that we must mark the page dirty in the * first object so that it will go out to swap when needed. 
*/ if (lookup_still_valid && /* * Only one shadow object */ (object->shadow_count == 1) && /* * No COW refs, except us */ (object->ref_count == 1) && /* * Noone else can look this object up */ (object->handle == NULL) && /* * No other ways to look the object up */ ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) && /* * We don't chase down the shadow chain */ (object == first_object->backing_object)) { /* * get rid of the unnecessary page */ vm_page_protect(first_m, VM_PROT_NONE); PAGE_WAKEUP(first_m); vm_page_free(first_m); /* * grab the page and put it into the process'es object */ vm_page_rename(m, first_object, first_pindex); first_m = m; m->dirty = VM_PAGE_BITS_ALL; m = NULL; + ++vm_fault_copy_save_1; } else { /* * Oh, well, lets copy it. */ vm_page_copy(m, first_m); } /* * This code handles the case where there are two references to the * backing object, and one reference is getting a copy of the * page. If the other reference is the only other object that * points to the backing object, then perform a virtual copy * from the backing object to the other object after the * page is copied to the current first_object. If the other * object already has the page, we destroy it in the backing object * performing an optimized collapse-type operation. We don't * bother removing the page from the backing object's swap space. */ if (lookup_still_valid && /* * make sure that we have two shadow objs */ (object->shadow_count == 2) && /* * And no COW refs -- note that there are sometimes * temp refs to objs, but ignore that case -- we just * punt. 
*/ (object->ref_count == 2) && /* * Noone else can look us up */ (object->handle == NULL) && /* * Not something that can be referenced elsewhere */ ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) && /* * We don't bother chasing down object chain */ (object == first_object->backing_object)) { vm_object_t other_object; vm_pindex_t other_pindex, other_pindex_offset; vm_page_t tm; other_object = TAILQ_FIRST(&object->shadow_head); if (other_object == first_object) other_object = TAILQ_NEXT(other_object, shadow_list); if (!other_object) panic("vm_fault: other object missing"); if (other_object && (other_object->type == OBJT_DEFAULT) && (other_object->paging_in_progress == 0)) { other_pindex_offset = OFF_TO_IDX(other_object->backing_object_offset); if (pindex >= other_pindex_offset) { other_pindex = pindex - other_pindex_offset; /* * If the other object has the page, just free it. */ if ((tm = vm_page_lookup(other_object, other_pindex))) { if ((tm->flags & PG_BUSY) == 0 && tm->busy == 0 && tm->valid == VM_PAGE_BITS_ALL) { /* * get rid of the unnecessary page */ vm_page_protect(m, VM_PROT_NONE); PAGE_WAKEUP(m); vm_page_free(m); m = NULL; + ++vm_fault_free_1; tm->dirty = VM_PAGE_BITS_ALL; first_m->dirty = VM_PAGE_BITS_ALL; } } else { /* * If the other object doesn't have the page, * then we move it there. */ vm_page_rename(m, other_object, other_pindex); m->dirty = VM_PAGE_BITS_ALL; m->valid = VM_PAGE_BITS_ALL; + ++vm_fault_copy_save_2; } } } } if (m) { if (m->queue != PQ_ACTIVE) vm_page_activate(m); - /* - * We no longer need the old page or object. - */ + /* + * We no longer need the old page or object. + */ PAGE_WAKEUP(m); } vm_object_pip_wakeup(object); /* * Only use the new page below... */ cnt.v_cow_faults++; m = first_m; object = first_object; pindex = first_pindex; /* * Now that we've gotten the copy out of the way, * let's try to collapse the top object. * * But we have to play ugly games with * paging_in_progress to do that... 
*/ vm_object_pip_wakeup(object); vm_object_collapse(object); object->paging_in_progress++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; /* * Since map entries may be pageable, make sure we can take a * page fault on them. */ /* * To avoid trying to write_lock the map while another process * has it read_locked (in vm_map_pageable), we do not try for * write permission. If the page is still writable, we will * get write permission. If it is not, or has been marked * needs_copy, we enter the mapping without write permission, * and will merely take another fault. */ result = vm_map_lookup(&map, vaddr, fault_type & ~VM_PROT_WRITE, &entry, &retry_object, &retry_pindex, &retry_prot, &wired, &su); /* * If we don't need the page any longer, put it on the active * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; return (result); } lookup_still_valid = TRUE; if ((retry_object != first_object) || (retry_pindex != first_pindex)) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter may cause other faults. We don't put the page * back on the active queue until later so that the page-out daemon * won't find us (yet). 
*/ if (prot & VM_PROT_WRITE) { m->flags |= PG_WRITEABLE; m->object->flags |= OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY; /* * If the fault is a write, we know that this page is being * written NOW. This will save on the pmap_is_modified() calls * later. */ if (fault_type & VM_PROT_WRITE) { m->dirty = VM_PAGE_BITS_ALL; } } UNLOCK_THINGS; m->valid = VM_PAGE_BITS_ALL; m->flags &= ~PG_ZERO; pmap_enter(map->pmap, vaddr, VM_PAGE_TO_PHYS(m), prot, wired); if ((change_wiring == 0) && (wired == 0)) pmap_prefault(map->pmap, vaddr, entry, first_object); m->flags |= PG_MAPPED|PG_REFERENCED; /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (change_wiring) { if (wired) vm_page_wire(m); else vm_page_unwire(m); } else { if (m->queue != PQ_ACTIVE) vm_page_activate(m); } if (curproc && (curproc->p_flag & P_INMEM) && curproc->p_stats) { if (hardfault) { curproc->p_stats->p_ru.ru_majflt++; } else { curproc->p_stats->p_ru.ru_minflt++; } } /* * Unlock everything, and return */ PAGE_WAKEUP(m); vm_object_deallocate(first_object); return (KERN_SUCCESS); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. */ int vm_fault_wire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va; register pmap_t pmap; int rv; pmap = vm_map_pmap(map); /* * Inform the physical mapping system that the range of addresses may * not fault, so that page tables and such can be locked down as well. */ pmap_pageable(pmap, start, end, FALSE); /* * We simulate a fault to get the page and enter it in the physical * map. */ for (va = start; va < end; va += PAGE_SIZE) { rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE, TRUE); if (rv) { if (va != start) vm_fault_unwire(map, start, va); return (rv); } } return (KERN_SUCCESS); } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. 
*/ void vm_fault_unwire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va, pa; register pmap_t pmap; pmap = vm_map_pmap(map); /* * Since the pages are wired down, we must be able to get their * mappings from the physical map system. */ for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa != (vm_offset_t) 0) { pmap_change_wiring(pmap, va, FALSE); vm_page_unwire(PHYS_TO_VM_PAGE(pa)); } } /* * Inform the physical mapping system that the range of addresses may * fault, so that page tables and such may be unwired themselves. */ pmap_pageable(pmap, start, end, TRUE); } /* * Routine: * vm_fault_copy_entry * Function: * Copy all of the pages from a wired-down map entry to another. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) vm_map_t dst_map; vm_map_t src_map; vm_map_entry_t dst_entry; vm_map_entry_t src_entry; { vm_object_t dst_object; vm_object_t src_object; vm_ooffset_t dst_offset; vm_ooffset_t src_offset; vm_prot_t prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; #ifdef lint src_map++; #endif /* lint */ src_object = src_entry->object.vm_object; src_offset = src_entry->offset; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, (vm_size_t) OFF_TO_IDX(dst_entry->end - dst_entry->start)); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; prot = dst_entry->max_protection; /* * Loop through all of the pages in the entry's range, copying each * one from the source object (it should be there) to the destination * object. 
*/ for (vaddr = dst_entry->start, dst_offset = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { /* * Allocate a page in the destination object */ do { dst_m = vm_page_alloc(dst_object, OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_WAIT; } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) */ src_m = vm_page_lookup(src_object, OFF_TO_IDX(dst_offset + src_offset)); if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); vm_page_copy(src_m, dst_m); /* * Enter it in the pmap... */ dst_m->flags &= ~PG_ZERO; pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m), prot, FALSE); dst_m->flags |= PG_WRITEABLE|PG_MAPPED; /* * Mark it no longer busy, and put it on the active list. */ vm_page_activate(dst_m); PAGE_WAKEUP(dst_m); } } /* * This routine checks around the requested page for other pages that * might be able to be faulted in. This routine brackets the viable * pages for the pages to be paged in. 
* * Inputs: * m, rbehind, rahead * * Outputs: * marray (array of vm_page_t), reqpage (index of requested page) * * Return value: * number of pages in marray */ int vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) vm_page_t m; int rbehind; int rahead; vm_page_t *marray; int *reqpage; { int i; vm_object_t object; vm_pindex_t pindex, startpindex, endpindex, tpindex; vm_offset_t size; vm_page_t rtm; int treqpage; int cbehind, cahead; object = m->object; pindex = m->pindex; /* * we don't fault-ahead for device pager */ if (object->type == OBJT_DEVICE) { *reqpage = 0; marray[0] = m; return 1; } /* * if the requested page is not available, then give up now */ if (!vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + pindex, &cbehind, &cahead)) return 0; if ((cbehind == 0) && (cahead == 0)) { *reqpage = 0; marray[0] = m; return 1; } if (rahead > cahead) { rahead = cahead; } if (rbehind > cbehind) { rbehind = cbehind; } /* * try to do any readahead that we might have free pages for. 
*/ if ((rahead + rbehind) > ((cnt.v_free_count + cnt.v_cache_count) - cnt.v_free_reserved)) { pagedaemon_wakeup(); *reqpage = 0; marray[0] = m; return 1; } /* * scan backward for the read behind pages -- in memory or on disk not * in same object */ tpindex = pindex - 1; if (tpindex < pindex) { if (rbehind > pindex) rbehind = pindex; startpindex = pindex - rbehind; while (tpindex >= startpindex) { if (vm_page_lookup( object, tpindex)) { startpindex = tpindex + 1; break; } if (tpindex == 0) break; tpindex -= 1; } } else { startpindex = pindex; } /* * scan forward for the read ahead pages -- in memory or on disk not * in same object */ tpindex = pindex + 1; endpindex = pindex + (rahead + 1); if (endpindex > object->size) endpindex = object->size; - while (tpindex < endpindex) { + while (tpindex < endpindex) { if ( vm_page_lookup(object, tpindex)) { break; } tpindex += 1; } endpindex = tpindex; /* calculate number of bytes of pages */ size = endpindex - startpindex; /* calculate the page offset of the required page */ treqpage = pindex - startpindex; /* see if we have space (again) */ if ((cnt.v_free_count + cnt.v_cache_count) > (cnt.v_free_reserved + size)) { /* * get our pages and don't block for them */ for (i = 0; i < size; i++) { if (i != treqpage) { rtm = vm_page_alloc(object, startpindex + i, VM_ALLOC_NORMAL); if (rtm == NULL) { if (i < treqpage) { int j; for (j = 0; j < i; j++) { FREE_PAGE(marray[j]); } *reqpage = 0; marray[0] = m; return 1; } else { size = i; *reqpage = treqpage; return size; } } marray[i] = rtm; } else { marray[i] = m; } } *reqpage = treqpage; return size; } *reqpage = 0; marray[0] = m; return 1; } Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 17333) +++ head/sys/vm/vm_map.c (revision 17334) @@ -1,2449 +1,2440 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_map.c,v 1.52 1996/07/07 03:27:41 davidg Exp $ + * $Id: vm_map.c,v 1.53 1996/07/27 03:23:56 dyson Exp $ */ /* * Virtual memory mapping module. */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a single hint is used to speed up lookups. * * In order to properly represent the sharing of virtual * memory regions among maps, the map structure is bi-level. * Top-level ("address") maps refer to regions of sharable * virtual memory. These regions are implemented as * ("sharing") maps, which then refer to the actual virtual * memory objects. 
When two address maps "share" memory, * their top-level maps both have references to the same * sharing map. When memory is virtual-copied from one * address map to another, the references in the sharing * maps are actually copied -- no copying occurs at the * virtual memory object level. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * No attempt is currently made to "glue back together" two * abutting entries. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one sharing map to * another, and then marking both regions as copy-on-write. * It is important to note that only one writeable reference * to a VM object region exists in any map -- this means that * shadow object creation can be delayed until a write operation * occurs. */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries.
*/ vm_offset_t kentry_data; vm_size_t kentry_data_size; static vm_map_entry_t kentry_free; static vm_map_t kmap_free; extern char kstack[]; static int kentry_count; static vm_offset_t mapvm_start, mapvm, mapvmmax; static int mapvmpgcnt; static struct vm_map_entry *mappool; static int mappoolcnt; #define KENTRY_LOW_WATER 128 static void _vm_map_clip_end __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static void _vm_map_clip_start __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static vm_map_entry_t vm_map_entry_create __P((vm_map_t)); static void vm_map_entry_delete __P((vm_map_t, vm_map_entry_t)); static __inline void vm_map_entry_dispose __P((vm_map_t, vm_map_entry_t)); static void vm_map_entry_unwire __P((vm_map_t, vm_map_entry_t)); static void vm_map_copy_entry __P((vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t)); static void vm_map_simplify_entry __P((vm_map_t, vm_map_entry_t)); -static __pure int vm_map_simplify_okay __P((vm_map_entry_t entry1, - vm_map_entry_t entry2)); void vm_map_startup() { register int i; register vm_map_entry_t mep; vm_map_t mp; /* * Static map structures for allocation before initialization of * kernel map or kmem map. vm_map_create knows how to deal with them. */ kmap_free = mp = (vm_map_t) kentry_data; i = MAX_KMAP; while (--i > 0) { mp->header.next = (vm_map_entry_t) (mp + 1); mp++; } mp++->header.next = NULL; /* * Form a free list of statically allocated kernel map entries with * the rest. */ kentry_free = mep = (vm_map_entry_t) mp; kentry_count = i = (kentry_data_size - MAX_KMAP * sizeof *mp) / sizeof *mep; while (--i > 0) { mep->next = mep + 1; mep++; } mep->next = NULL; } /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * The remaining fields must be initialized by the caller. 
*/ struct vmspace * vmspace_alloc(min, max, pageable) vm_offset_t min, max; int pageable; { register struct vmspace *vm; if (mapvmpgcnt == 0 && mapvm == 0) { mapvmpgcnt = (cnt.v_page_count * sizeof(struct vm_map_entry) + PAGE_SIZE - 1) / PAGE_SIZE; mapvm_start = mapvm = kmem_alloc_pageable(kernel_map, mapvmpgcnt * PAGE_SIZE); mapvmmax = mapvm_start + mapvmpgcnt * PAGE_SIZE; if (!mapvm) mapvmpgcnt = 0; } MALLOC(vm, struct vmspace *, sizeof(struct vmspace), M_VMMAP, M_WAITOK); bzero(vm, (caddr_t) &vm->vm_startcopy - (caddr_t) vm); vm_map_init(&vm->vm_map, min, max, pageable); pmap_pinit(&vm->vm_pmap); vm->vm_map.pmap = &vm->vm_pmap; /* XXX */ + vm->vm_pmap.pm_map = &vm->vm_map; vm->vm_refcnt = 1; return (vm); } void vmspace_free(vm) register struct vmspace *vm; { if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (--vm->vm_refcnt == 0) { /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ vm_map_lock(&vm->vm_map); (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); vm_map_unlock(&vm->vm_map); while( vm->vm_map.ref_count != 1) tsleep(&vm->vm_map.ref_count, PVM, "vmsfre", 0); --vm->vm_map.ref_count; vm_object_pmap_remove(vm->vm_upages_obj, 0, vm->vm_upages_obj->size); vm_object_deallocate(vm->vm_upages_obj); pmap_release(&vm->vm_pmap); FREE(vm, M_VMMAP); } else { wakeup(&vm->vm_map.ref_count); } } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. 
*/ vm_map_t vm_map_create(pmap, min, max, pageable) pmap_t pmap; vm_offset_t min, max; boolean_t pageable; { register vm_map_t result; if (kmem_map == NULL) { result = kmap_free; kmap_free = (vm_map_t) result->header.next; if (result == NULL) panic("vm_map_create: out of maps"); } else MALLOC(result, vm_map_t, sizeof(struct vm_map), M_VMMAP, M_WAITOK); vm_map_init(result, min, max, pageable); result->pmap = pmap; return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. * The pmap is set elsewhere. */ void vm_map_init(map, min, max, pageable) register struct vm_map *map; vm_offset_t min, max; boolean_t pageable; { map->header.next = map->header.prev = &map->header; map->nentries = 0; map->size = 0; map->ref_count = 1; map->is_main_map = TRUE; map->min_offset = min; map->max_offset = max; map->entries_pageable = pageable; map->first_free = &map->header; map->hint = &map->header; map->timestamp = 0; lock_init(&map->lock, TRUE); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static __inline void vm_map_entry_dispose(map, entry) vm_map_t map; vm_map_entry_t entry; { int s; if (kentry_count < KENTRY_LOW_WATER) { s = splvm(); entry->next = kentry_free; kentry_free = entry; ++kentry_count; splx(s); } else { entry->next = mappool; mappool = entry; ++mappoolcnt; } } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. This routine is */ static vm_map_entry_t vm_map_entry_create(map) vm_map_t map; { vm_map_entry_t entry; int i; int s; /* * This is a *very* nasty (and sort of incomplete) hack!!!! */ if (kentry_count < KENTRY_LOW_WATER) { s = splvm(); if (mapvmpgcnt && mapvm) { vm_page_t m; m = vm_page_alloc(kernel_object, OFF_TO_IDX(mapvm - VM_MIN_KERNEL_ADDRESS), (map == kmem_map || map == mb_map) ? 
VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL); if (m) { int newentries; newentries = (PAGE_SIZE / sizeof(struct vm_map_entry)); vm_page_wire(m); PAGE_WAKEUP(m); m->valid = VM_PAGE_BITS_ALL; pmap_kenter(mapvm, VM_PAGE_TO_PHYS(m)); m->flags |= PG_WRITEABLE; entry = (vm_map_entry_t) mapvm; mapvm += PAGE_SIZE; --mapvmpgcnt; for (i = 0; i < newentries; i++) { vm_map_entry_dispose(kernel_map, entry); entry++; } } } splx(s); } if (map == kernel_map || map == kmem_map || map == mb_map || map == pager_map) { s = splvm(); entry = kentry_free; if (entry) { kentry_free = entry->next; --kentry_count; } else { panic("vm_map_entry_create: out of map entries for kernel"); } splx(s); } else { entry = mappool; if (entry) { mappool = entry->next; --mappoolcnt; } else { MALLOC(entry, vm_map_entry_t, sizeof(struct vm_map_entry), M_VMMAPENT, M_WAITOK); } } return (entry); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ #define vm_map_entry_link(map, after_where, entry) \ { \ (map)->nentries++; \ (entry)->prev = (after_where); \ (entry)->next = (after_where)->next; \ (entry)->prev->next = (entry); \ (entry)->next->prev = (entry); \ } #define vm_map_entry_unlink(map, entry) \ { \ (map)->nentries--; \ (entry)->next->prev = (entry)->prev; \ (entry)->prev->next = (entry)->next; \ } /* * vm_map_reference: * * Creates another valid reference to the given map. * */ void vm_map_reference(map) register vm_map_t map; { if (map == NULL) return; map->ref_count++; } /* * vm_map_deallocate: * * Removes a reference from the specified map, * destroying it if no references remain. * The map should not be locked. */ void vm_map_deallocate(map) register vm_map_t map; { register int c; if (map == NULL) return; c = map->ref_count; if (c == 0) panic("vm_map_deallocate: deallocating already freed map"); if (c != 1) { --map->ref_count; wakeup(&map->ref_count); return; } /* * Lock the map, to wait out all other references to it. 
*/ vm_map_lock(map); (void) vm_map_delete(map, map->min_offset, map->max_offset); --map->ref_count; if( map->ref_count != 0) { vm_map_unlock(map); return; } pmap_destroy(map->pmap); FREE(map, M_VMMAP); } /* * SAVE_HINT: * * Saves the specified entry as the hint for * future lookups. */ #define SAVE_HINT(map,value) \ (map)->hint = (value); /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry(map, address, entry) register vm_map_t map; register vm_offset_t address; vm_map_entry_t *entry; /* OUT */ { register vm_map_entry_t cur; register vm_map_entry_t last; /* * Start looking either from the head of the list, or from the hint. */ cur = map->hint; if (cur == &map->header) cur = cur->next; if (address >= cur->start) { /* * Go from hint to end of list. * * But first, make a quick check to see if we are already looking * at the entry we want (which is usually the case). Note also * that we don't need to save the hint here... it is the same * hint (unless we are at the header, in which case the hint * didn't buy us anything anyway). */ last = &map->header; if ((cur != last) && (cur->end > address)) { *entry = cur; return (TRUE); } } else { /* * Go from start to hint, *inclusively* */ last = cur->next; cur = map->header.next; } /* * Search linearly */ while (cur != last) { if (cur->end > address) { if (address >= cur->start) { /* * Save this lookup for future hints, and * return */ *entry = cur; SAVE_HINT(map, cur); return (TRUE); } break; } cur = cur->next; } *entry = cur->prev; SAVE_HINT(map, *entry); return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. 
* * Requires that the map be locked, and leaves it so. */ int vm_map_insert(map, object, offset, start, end, prot, max, cow) vm_map_t map; vm_object_t object; vm_ooffset_t offset; vm_offset_t start; vm_offset_t end; vm_prot_t prot, max; int cow; { register vm_map_entry_t new_entry; register vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_object_t prev_object; /* * Check that the start and end points are not bogus. */ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); if ((prev_entry != &map->header) && - (object == NULL) && (prev_entry->end == start) && + ((object == NULL) || (prev_entry->object.vm_object == object)) && (prev_entry->is_a_map == FALSE) && (prev_entry->is_sub_map == FALSE) && (prev_entry->inheritance == VM_INHERIT_DEFAULT) && (prev_entry->protection == prot) && (prev_entry->max_protection == max) && (prev_entry->wired_count == 0)) { /* * See if we can avoid creating a new entry by extending one of our * neighbors. */ - if (vm_object_coalesce(prev_entry->object.vm_object, - OFF_TO_IDX(prev_entry->offset), - (vm_size_t) (prev_entry->end - - prev_entry->start), - (vm_size_t) (end - prev_entry->end))) { + if (object == NULL) { + if (vm_object_coalesce(prev_entry->object.vm_object, + OFF_TO_IDX(prev_entry->offset), + (vm_size_t) (prev_entry->end + - prev_entry->start), + (vm_size_t) (end - prev_entry->end))) { - /* - * Coalesced the two objects - can extend the - * previous map entry to include the new - * range. 
- */ - map->size += (end - prev_entry->end); - prev_entry->end = end; - prev_object = prev_entry->object.vm_object; - default_pager_convert_to_swapq(prev_object); - return (KERN_SUCCESS); + /* + * Coalesced the two objects - can extend the + * previous map entry to include the new + * range. + */ + map->size += (end - prev_entry->end); + prev_entry->end = end; + prev_object = prev_entry->object.vm_object; + default_pager_convert_to_swapq(prev_object); + return (KERN_SUCCESS); + } } } /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->is_a_map = FALSE; new_entry->is_sub_map = FALSE; new_entry->object.vm_object = object; new_entry->offset = offset; if (cow & MAP_COPY_NEEDED) new_entry->needs_copy = TRUE; else new_entry->needs_copy = FALSE; if (cow & MAP_COPY_ON_WRITE) new_entry->copy_on_write = TRUE; else new_entry->copy_on_write = FALSE; if (map->is_main_map) { new_entry->inheritance = VM_INHERIT_DEFAULT; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; } /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; /* * Update the free space hint */ - if (map->first_free == prev_entry) { - if (prev_entry->end == new_entry->start) - map->first_free = new_entry; - } + if ((map->first_free == prev_entry) && + (prev_entry->end >= new_entry->start)) + map->first_free = new_entry; default_pager_convert_to_swapq(object); return (KERN_SUCCESS); } /* * Find sufficient space for `length' bytes in the given map, starting at * `start'. The map must be locked. Returns 0 on success, 1 on no space. 
*/ int vm_map_findspace(map, start, length, addr) register vm_map_t map; register vm_offset_t start; vm_size_t length; vm_offset_t *addr; { register vm_map_entry_t entry, next; register vm_offset_t end; if (start < map->min_offset) start = map->min_offset; if (start > map->max_offset) return (1); /* * Look for the first possible address; if there's already something * at this address, we have to start after it. */ if (start == map->min_offset) { - if ((entry = map->first_free) != &map->header) { + if ((entry = map->first_free) != &map->header) start = entry->end; - } } else { vm_map_entry_t tmp; if (vm_map_lookup_entry(map, start, &tmp)) start = tmp->end; entry = tmp; } /* * Look through the rest of the map, trying to fit a new region in the * gap between existing regions, or after the very last region. */ for (;; start = (entry = next)->end) { /* * Find the end of the proposed new region. Be sure we didn't * go beyond the end of the map, or wrap around the address; * if so, we lose. Otherwise, if this is the last entry, or * if the proposed new region fits before the next entry, we * win. */ end = start + length; if (end > map->max_offset || end < start) return (1); next = entry->next; if (next == &map->header || next->start >= end) break; } SAVE_HINT(map, entry); *addr = start; if (map == kernel_map && round_page(start + length) > kernel_vm_end) pmap_growkernel(round_page(start + length)); return (0); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. 
* */ int vm_map_find(map, object, offset, addr, length, find_space, prot, max, cow) vm_map_t map; vm_object_t object; vm_ooffset_t offset; vm_offset_t *addr; /* IN/OUT */ vm_size_t length; boolean_t find_space; vm_prot_t prot, max; int cow; { register vm_offset_t start; int result, s = 0; start = *addr; if (map == kmem_map || map == mb_map) s = splvm(); vm_map_lock(map); if (find_space) { if (vm_map_findspace(map, start, length, addr)) { vm_map_unlock(map); if (map == kmem_map || map == mb_map) splx(s); return (KERN_NO_SPACE); } start = *addr; } result = vm_map_insert(map, object, offset, start, start + length, prot, max, cow); vm_map_unlock(map); if (map == kmem_map || map == mb_map) splx(s); return (result); } -static __pure int -vm_map_simplify_okay(entry1, entry2) - vm_map_entry_t entry1, entry2; -{ - if ((entry1->end != entry2->start) || - (entry1->object.vm_object != entry2->object.vm_object)) - return 0; - if (entry1->object.vm_object) { - if (entry1->object.vm_object->behavior != - entry2->object.vm_object->behavior) - return 0; - if (entry1->offset + (entry1->end - entry1->start) != - entry2->offset) - return 0; - } - if ((entry1->needs_copy != entry2->needs_copy) || - (entry1->copy_on_write != entry2->copy_on_write) || - (entry1->protection != entry2->protection) || - (entry1->max_protection != entry2->max_protection) || - (entry1->inheritance != entry2->inheritance) || - (entry1->is_sub_map != FALSE) || - (entry1->is_a_map != FALSE) || - (entry1->wired_count != 0) || - (entry2->is_sub_map != FALSE) || - (entry2->is_a_map != FALSE) || - (entry2->wired_count != 0)) - return 0; - - return 1; -} - /* * vm_map_simplify_entry: [ internal use only ] + * + * Simplify the given map entry by: + * removing extra sharing maps + * [XXX maybe later] merging with a neighbor */ static void vm_map_simplify_entry(map, entry) vm_map_t map; vm_map_entry_t entry; { vm_map_entry_t next, prev; + vm_size_t nextsize, prevsize, esize; - if (entry->is_a_map || entry->is_sub_map || 
entry->wired_count) + /* + * If this entry corresponds to a sharing map, then see if we can + * remove the level of indirection. If it's not a sharing map, then it + * points to a VM object, so see if we can merge with either of our + * neighbors. + */ + + if (entry->is_sub_map || entry->is_a_map || entry->wired_count) return; prev = entry->prev; if (prev != &map->header) { - if ( vm_map_simplify_okay(prev, entry)) { + prevsize = prev->end - prev->start; + if ( (prev->end == entry->start) && + (prev->object.vm_object == entry->object.vm_object) && + (!prev->object.vm_object || (prev->object.vm_object->behavior == entry->object.vm_object->behavior)) && + (!prev->object.vm_object || + (prev->offset + prevsize == entry->offset)) && + (prev->needs_copy == entry->needs_copy) && + (prev->copy_on_write == entry->copy_on_write) && + (prev->protection == entry->protection) && + (prev->max_protection == entry->max_protection) && + (prev->inheritance == entry->inheritance) && + (prev->is_a_map == FALSE) && + (prev->is_sub_map == FALSE) && + (prev->wired_count == 0)) { if (map->first_free == prev) map->first_free = entry; if (map->hint == prev) map->hint = entry; vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); vm_map_entry_dispose(map, prev); } } next = entry->next; if (next != &map->header) { - if ( vm_map_simplify_okay(entry, next)) { + nextsize = next->end - next->start; + esize = entry->end - entry->start; + if ((entry->end == next->start) && + (next->object.vm_object == entry->object.vm_object) && + (!next->object.vm_object || (next->object.vm_object->behavior == entry->object.vm_object->behavior)) && + (!entry->object.vm_object || + (entry->offset + esize == next->offset)) && + (next->needs_copy == entry->needs_copy) && + (next->copy_on_write == entry->copy_on_write) && + (next->protection == entry->protection) && + (next->max_protection == 
entry->max_protection) && + (next->inheritance == entry->inheritance) && + (next->is_a_map == FALSE) && + (next->is_sub_map == FALSE) && + (next->wired_count == 0)) { if (map->first_free == next) map->first_free = entry; if (map->hint == next) map->hint = entry; vm_map_entry_unlink(map, next); entry->end = next->end; if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); vm_map_entry_dispose(map, next); } } } - /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(map, entry, start) register vm_map_t map; register vm_map_entry_t entry; register vm_offset_t start; { register vm_map_entry_t new_entry; /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; vm_map_entry_link(map, entry->prev, new_entry); if (entry->is_a_map || entry->is_sub_map) vm_map_reference(new_entry->object.share_map); else vm_object_reference(new_entry->object.vm_object); } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if (endaddr < entry->end) \ _vm_map_clip_end(map, entry, endaddr); \ } /* * This routine is called only when it is known that * the entry must be split. 
*/ static void _vm_map_clip_end(map, entry, end) register vm_map_t map; register vm_map_entry_t entry; register vm_offset_t end; { register vm_map_entry_t new_entry; /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); vm_map_entry_link(map, entry, new_entry); if (entry->is_a_map || entry->is_sub_map) vm_map_reference(new_entry->object.share_map); else vm_object_reference(new_entry->object.vm_object); } /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this range after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.]
*/ int vm_map_submap(map, start, end, submap) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; vm_map_t submap; { vm_map_entry_t entry; register int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && (!entry->is_a_map) && (entry->object.vm_object == NULL) && (!entry->copy_on_write)) { entry->is_a_map = FALSE; entry->is_sub_map = TRUE; vm_map_reference(entry->object.sub_map = submap); result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(map, start, end, new_prot, set_max) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_prot_t new_prot; register boolean_t set_max; { register vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; /* * Make a first pass to check for protection violations. */ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->is_sub_map) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] 
*/ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; vm_map_clip_end(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * Update physical map if necessary. Worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { #define MASK(entry) ((entry)->copy_on_write ? ~VM_PROT_WRITE : \ VM_PROT_ALL) #define max(a,b) ((a) > (b) ? (a) : (b)) if (current->is_a_map) { vm_map_entry_t share_entry; vm_offset_t share_end; vm_map_lock(current->object.share_map); (void) vm_map_lookup_entry( current->object.share_map, current->offset, &share_entry); share_end = current->offset + (current->end - current->start); while ((share_entry != ¤t->object.share_map->header) && (share_entry->start < share_end)) { pmap_protect(map->pmap, (max(share_entry->start, current->offset) - current->offset + current->start), min(share_entry->end, share_end) - current->offset + current->start, current->protection & MASK(share_entry)); share_entry = share_entry->next; } vm_map_unlock(current->object.share_map); } else pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(entry)); #undef max #undef MASK } current = current->next; } vm_map_simplify_entry(map, entry); vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. 
*/ void vm_map_madvise(map, pmap, start, end, advise) vm_map_t map; pmap_t pmap; vm_offset_t start, end; int advise; { register vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; for(current = entry; (current != &map->header) && (current->start < end); current = current->next) { if (current->is_a_map || current->is_sub_map) { continue; } vm_map_clip_end(map, current, end); switch (advise) { case MADV_NORMAL: current->object.vm_object->behavior = OBJ_NORMAL; break; case MADV_SEQUENTIAL: current->object.vm_object->behavior = OBJ_SEQUENTIAL; break; case MADV_RANDOM: current->object.vm_object->behavior = OBJ_RANDOM; break; /* * Right now, we could handle DONTNEED and WILLNEED with common code. * They are mostly the same, except for the potential async reads (NYI). */ case MADV_FREE: case MADV_DONTNEED: { vm_pindex_t pindex; int count; vm_size_t size = entry->end - entry->start; pindex = OFF_TO_IDX(entry->offset); count = OFF_TO_IDX(size); /* * MADV_DONTNEED removes the page from all * pmaps, so pmap_remove is not necessary. */ vm_object_madvise(current->object.vm_object, pindex, count, advise); } break; case MADV_WILLNEED: { vm_pindex_t pindex; int count; vm_size_t size = entry->end - entry->start; pindex = OFF_TO_IDX(entry->offset); count = OFF_TO_IDX(size); vm_object_madvise(current->object.vm_object, pindex, count, advise); pmap_object_init_pt(pmap, current->start, current->object.vm_object, pindex, (count << PAGE_SHIFT), 0); } break; default: break; } } vm_map_simplify_entry(map, entry); vm_map_unlock(map); return; } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. 
*/ int vm_map_inherit(map, start, end, new_inheritance) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_inherit_t new_inheritance; { register vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; entry = entry->next; } vm_map_simplify_entry(map, temp_entry); vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_pageable: * * Sets the pageability of the specified address * range in the target map. Regions specified * as not pageable require locked-down physical * memory and physical page maps. * * The map must not be locked, but a reference * must remain to the map throughout the call. */ int vm_map_pageable(map, start, end, new_pageable) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register boolean_t new_pageable; { register vm_map_entry_t entry; vm_map_entry_t start_entry; register vm_offset_t failed = 0; int rv; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); /* * Only one pageability change may take place at one time, since * vm_fault assumes it will be called only once for each * wiring/unwiring. Therefore, we have to make sure we're actually * changing the pageability for the entire region. We do so before * making any changes. */ if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } entry = start_entry; /* * Actions are rather different for wiring and unwiring, so we have * two separate cases. 
*/ if (new_pageable) { vm_map_clip_start(map, entry, start); /* * Unwiring. First ensure that the range to be unwired is * really wired down and that there are no holes. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0 || (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Now decrement the wiring count for each region. If a region * becomes completely unwired, unwire its physical pages and * mappings. */ lock_set_recursive(&map->lock); entry = start_entry; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->wired_count--; if (entry->wired_count == 0) vm_fault_unwire(map, entry->start, entry->end); entry = entry->next; } vm_map_simplify_entry(map, start_entry); lock_clear_recursive(&map->lock); } else { /* * Wiring. We must do this in two passes: * * 1. Holding the write lock, we create any shadow or zero-fill * objects that need to be created. Then we clip each map * entry to the region to be wired and increment its wiring * count. We create objects before clipping the map entries * to avoid object proliferation. * * 2. We downgrade to a read lock, and call vm_fault_wire to * fault in the pages for any newly wired area (wired_count is * 1). * * Downgrading to a read lock for vm_fault_wire avoids a possible * deadlock with another process that may have faulted on one * of the pages to be wired (it would mark the page busy, * blocking us, then in turn block on the map lock that we * hold). Because of problems in the recursive lock package, * we cannot upgrade to a write lock in vm_map_lookup. Thus, * any actions that require the write lock must be done * beforehand. Because we keep the read lock on the map, the * copy-on-write status of the entries we modify here cannot * change. */ /* * Pass 1. 
*/ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0) { /* * Perform actions of vm_map_lookup that need * the write lock on the map: create a shadow * object for a copy-on-write region, or an * object for a zero-fill region. * * We don't have to do this for entries that * point to sharing maps, because we won't * hold the lock on the sharing map. */ if (!entry->is_a_map && !entry->is_sub_map) { int copyflag = entry->needs_copy; if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) { vm_object_shadow(&entry->object.vm_object, &entry->offset, OFF_TO_IDX(entry->end - entry->start)); entry->needs_copy = FALSE; } else if (entry->object.vm_object == NULL) { entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(entry->end - entry->start)); entry->offset = (vm_offset_t) 0; } default_pager_convert_to_swapq(entry->object.vm_object); } } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); entry->wired_count++; /* * Check for holes */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { /* * Found one. Object creation actions do not * need to be undone, but the wired counts * need to be restored. */ while (entry != &map->header && entry->end > start) { entry->wired_count--; entry = entry->prev; } vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Pass 2. */ /* * HACK HACK HACK HACK * * If we are wiring in the kernel map or a submap of it, * unlock the map to avoid deadlocks. We trust that the * kernel is well-behaved, and therefore will not do * anything destructive to this region of the map while * we have it unlocked. We cannot trust user processes * to do the same. * * HACK HACK HACK HACK */ if (vm_map_pmap(map) == kernel_pmap) { vm_map_unlock(map); /* trust me ... 
*/ } else { lock_set_recursive(&map->lock); lock_write_to_read(&map->lock); } rv = 0; entry = start_entry; while (entry != &map->header && entry->start < end) { /* * If vm_fault_wire fails for any page we need to undo * what has been done. We decrement the wiring count * for those pages which have not yet been wired (now) * and unwire those that have (later). * * XXX this violates the locking protocol on the map, * needs to be fixed. */ if (rv) entry->wired_count--; else if (entry->wired_count == 1) { rv = vm_fault_wire(map, entry->start, entry->end); if (rv) { failed = entry->start; entry->wired_count--; } } entry = entry->next; } if (vm_map_pmap(map) == kernel_pmap) { vm_map_lock(map); } else { lock_clear_recursive(&map->lock); } if (rv) { vm_map_unlock(map); (void) vm_map_pageable(map, start, failed, TRUE); return (rv); } } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_clean * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_clean(map, start, end, syncio, invalidate) vm_map_t map; vm_offset_t start; vm_offset_t end; boolean_t syncio; boolean_t invalidate; { register vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } /* * Make a first pass to check for holes. 
*/ for (current = entry; current->start < end; current = current->next) { if (current->is_sub_map) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->is_a_map || current->is_sub_map) { register vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.share_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * anyway, for semantic correctness. */ while (object->backing_object) { object = object->backing_object; offset += object->backing_object_offset; if (object->size < OFF_TO_IDX( offset + size)) size = IDX_TO_OFF(object->size) - offset; } if (invalidate) pmap_remove(vm_map_pmap(map), current->start, current->start + size); if (object && (object->type == OBJT_VNODE)) { /* * Flush pages if writing is allowed. XXX should we continue * on an error? * * XXX Doing async I/O and then removing all the pages from * the object before it completes is probably a very bad * idea. 
*/ if (current->protection & VM_PROT_WRITE) { vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size), (syncio||invalidate)?1:0, TRUE); if (invalidate) vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size), FALSE); } } start += size; } vm_map_unlock_read(map); return (KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static __inline void vm_map_entry_unwire(map, entry) vm_map_t map; register vm_map_entry_t entry; { vm_fault_unwire(map, entry->start, entry->end); entry->wired_count = 0; } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static __inline void vm_map_entry_delete(map, entry) register vm_map_t map; register vm_map_entry_t entry; { vm_map_entry_unlink(map, entry); map->size -= entry->end - entry->start; if (entry->is_a_map || entry->is_sub_map) { vm_map_deallocate(entry->object.share_map); } else { vm_object_deallocate(entry->object.vm_object); } vm_map_entry_dispose(map, entry); } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. * * When called with a sharing map, removes pages from * that region from all physical maps. */ int vm_map_delete(map, start, end) register vm_map_t map; vm_offset_t start; register vm_offset_t end; { register vm_map_entry_t entry; vm_map_entry_t first_entry; /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); /* * Fix the lookup hint now, rather than each time though the * loop. 
*/ SAVE_HINT(map, entry->prev); } /* * Save the free space hint */ if (entry == &map->header) { map->first_free = &map->header; } else if (map->first_free->start >= start) map->first_free = entry->prev; /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; vm_offset_t s, e; vm_object_t object; vm_ooffset_t offset; vm_map_clip_end(map, entry, end); next = entry->next; s = entry->start; e = entry->end; offset = entry->offset; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ object = entry->object.vm_object; if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); /* * If this is a sharing map, we must remove *all* references * to this data, since we can't find all of the physical maps * which are sharing it. */ if (object == kernel_object || object == kmem_object) { vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + (e - s)), FALSE); } else if (!map->is_main_map) { vm_object_pmap_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + (e - s))); } else { pmap_remove(map->pmap, s, e); } /* * Delete the entry (which may delete the object) only after * removing all pmap entries pointing to its pages. * (Otherwise, its page frames may be reallocated, and any * modify bits will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. 
*/ int vm_map_remove(map, start, end) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; { register int result, s = 0; if (map == kmem_map || map == mb_map) s = splvm(); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); if (map == kmem_map || map == mb_map) splx(s); return (result); } /* - * vm_map_remove_userspace: - * Removes the user portion of the address space. - */ -void -vm_map_remove_userspace(map) - register vm_map_t map; -{ - vm_map_lock(map); - pmap_remove_pages(map->pmap, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); - vm_map_delete(map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); - vm_map_unlock(map); - return; -} - -/* * vm_map_check_protection: * * Assert that the target map allows the specified * privilege on the entire address region given. * The entire region must be allocated. */ boolean_t vm_map_check_protection(map, start, end, protection) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_prot_t protection; { register vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) { return (FALSE); } entry = tmp_entry; while (start < end) { if (entry == &map->header) { return (FALSE); } /* * No holes allowed! */ if (start < entry->start) { return (FALSE); } /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) { return (FALSE); } /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. 
*/ static void vm_map_copy_entry(src_map, dst_map, src_entry, dst_entry) vm_map_t src_map, dst_map; register vm_map_entry_t src_entry, dst_entry; { if (src_entry->is_sub_map || dst_entry->is_sub_map) return; if (src_entry->wired_count == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if (!src_entry->needs_copy) { boolean_t su; /* * If the source entry has only one mapping, we can * just protect the virtual address range. */ if (!(su = src_map->is_main_map)) { su = (src_map->ref_count == 1); } if (su) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } else { vm_object_pmap_copy(src_entry->object.vm_object, OFF_TO_IDX(src_entry->offset), OFF_TO_IDX(src_entry->offset + (src_entry->end - src_entry->start))); } } /* * Make a copy of the object. */ if (src_entry->object.vm_object) { if ((src_entry->object.vm_object->handle == NULL) && (src_entry->object.vm_object->type == OBJT_DEFAULT || src_entry->object.vm_object->type == OBJT_SWAP)) vm_object_collapse(src_entry->object.vm_object); ++src_entry->object.vm_object->ref_count; src_entry->copy_on_write = TRUE; src_entry->needs_copy = TRUE; dst_entry->needs_copy = TRUE; dst_entry->copy_on_write = TRUE; dst_entry->object.vm_object = src_entry->object.vm_object; dst_entry->offset = src_entry->offset; } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * Of course, wired down pages can't be set copy-on-write. * Cause wired pages to be copied into the new map by * simulating faults (the new pages are pageable) */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. 
* * The source map must not be locked. */ struct vmspace * vmspace_fork(vm1) register struct vmspace *vm1; { register struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_entry_t new_entry; pmap_t new_pmap; vm_object_t object; vm_map_lock(old_map); vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, old_map->entries_pageable); bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); new_pmap = &vm2->vm_pmap; /* XXX */ new_map = &vm2->vm_map; /* XXX */ old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->is_sub_map) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, referencing the sharing map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->wired_count = 0; object = new_entry->object.vm_object; ++object->ref_count; /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->is_a_map = FALSE; vm_map_entry_link(new_map, new_map->header.prev, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry); break; } old_entry = old_entry->next; } new_map->size = old_map->size; vm_map_unlock(old_map); return (vm2); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. 
* * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(var_map, vaddr, fault_type, out_entry, object, pindex, out_prot, wired, single_use) vm_map_t *var_map; /* IN/OUT */ register vm_offset_t vaddr; register vm_prot_t fault_type; vm_map_entry_t *out_entry; /* OUT */ vm_object_t *object; /* OUT */ vm_pindex_t *pindex; /* OUT */ vm_prot_t *out_prot; /* OUT */ boolean_t *wired; /* OUT */ boolean_t *single_use; /* OUT */ { vm_map_t share_map; vm_offset_t share_offset; register vm_map_entry_t entry; register vm_map_t map = *var_map; register vm_prot_t prot; register boolean_t su; RetryLookup:; /* * Lookup the faulting address. */ vm_map_lock_read(map); #define RETURN(why) \ { \ vm_map_unlock_read(map); \ return(why); \ } /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->hint; *out_entry = entry; if ((entry == &map->header) || (vaddr < entry->start) || (vaddr >= entry->end)) { vm_map_entry_t tmp_entry; /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) RETURN(KERN_INVALID_ADDRESS); entry = tmp_entry; *out_entry = entry; } /* * Handle submaps. */ if (entry->is_sub_map) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. 
*/ prot = entry->protection; if ((fault_type & (prot)) != fault_type) RETURN(KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; /* * If we don't already have a VM object, track it down. */ su = !entry->is_a_map; if (su) { share_map = map; share_offset = vaddr; } else { vm_map_entry_t share_entry; /* * Compute the sharing map, and offset into it. */ share_map = entry->object.share_map; share_offset = (vaddr - entry->start) + entry->offset; /* * Look for the backing store object and offset */ vm_map_lock_read(share_map); if (!vm_map_lookup_entry(share_map, share_offset, &share_entry)) { vm_map_unlock_read(share_map); RETURN(KERN_INVALID_ADDRESS); } entry = share_entry; } /* * If the entry was copy-on-write, we either ... */ if (entry->needs_copy) { /* * If we want to write the page, we may as well handle that * now since we've got the sharing map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if (fault_type & VM_PROT_WRITE) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the share map to the new * object. */ if (lock_read_to_write(&share_map->lock)) { if (share_map != map) vm_map_unlock_read(map); goto RetryLookup; } vm_object_shadow( &entry->object.vm_object, &entry->offset, OFF_TO_IDX(entry->end - entry->start)); entry->needs_copy = FALSE; lock_write_to_read(&share_map->lock); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= (~VM_PROT_WRITE); } } /* * Create an object if necessary. 
*/ if (entry->object.vm_object == NULL) { if (lock_read_to_write(&share_map->lock)) { if (share_map != map) vm_map_unlock_read(map); goto RetryLookup; } entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(entry->end - entry->start)); entry->offset = 0; lock_write_to_read(&share_map->lock); } - default_pager_convert_to_swapq(entry->object.vm_object); - + if (entry->object.vm_object != NULL) + default_pager_convert_to_swapq(entry->object.vm_object); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((share_offset - entry->start) + entry->offset); *object = entry->object.vm_object; /* * Return whether this is the only map sharing this data. */ if (!su) { su = (share_map->ref_count == 1); } *out_prot = prot; *single_use = su; return (KERN_SUCCESS); #undef RETURN } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(map, entry) register vm_map_t map; vm_map_entry_t entry; { /* * If this entry references a map, unlock it first. */ if (entry->is_a_map) vm_map_unlock_read(entry->object.share_map); /* * Unlock the main-level map */ vm_map_unlock_read(map); } /* * Routine: vm_map_simplify * Purpose: * Attempt to simplify the map representation in * the vicinity of the given starting address. * Note: * This routine is intended primarily to keep the * kernel maps more compact -- they generally don't * benefit from the "expand a map entry" technology * at allocation time because the adjacent entry * is often wired down. 
*/ void vm_map_simplify(map, start) vm_map_t map; vm_offset_t start; { vm_map_entry_t this_entry; vm_map_entry_t prev_entry; vm_map_lock(map); if ((vm_map_lookup_entry(map, start, &this_entry)) && ((prev_entry = this_entry->prev) != &map->header) && (prev_entry->end == start) && (prev_entry->object.vm_object == this_entry->object.vm_object) && ((prev_entry->offset + (prev_entry->end - prev_entry->start)) == this_entry->offset) && (map->is_main_map) && (prev_entry->is_a_map == FALSE) && (prev_entry->is_sub_map == FALSE) && (this_entry->is_a_map == FALSE) && (this_entry->is_sub_map == FALSE) && (prev_entry->inheritance == this_entry->inheritance) && (prev_entry->protection == this_entry->protection) && (prev_entry->max_protection == this_entry->max_protection) && (prev_entry->wired_count == this_entry->wired_count) && (prev_entry->copy_on_write == this_entry->copy_on_write) && (prev_entry->needs_copy == this_entry->needs_copy)) { if (map->first_free == this_entry) map->first_free = prev_entry; if (map->hint == this_entry) SAVE_HINT(map, prev_entry); vm_map_entry_unlink(map, this_entry); prev_entry->end = this_entry->end; if (this_entry->object.vm_object) vm_object_deallocate(this_entry->object.vm_object); vm_map_entry_dispose(map, this_entry); } vm_map_unlock(map); } #ifdef DDB /* * vm_map_print: [ debug ] */ void vm_map_print(imap, full, dummy3, dummy4) /* db_expr_t */ int imap; boolean_t full; /* db_expr_t */ int dummy3; char *dummy4; { register vm_map_entry_t entry; register vm_map_t map = (vm_map_t)imap; /* XXX */ iprintf("%s map 0x%x: pmap=0x%x,ref=%d,nentries=%d,version=%d\n", (map->is_main_map ? 
"Task" : "Share"), (int) map, (int) (map->pmap), map->ref_count, map->nentries, map->timestamp); if (!full && indent) return; indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { iprintf("map entry 0x%x: start=0x%x, end=0x%x, ", (int) entry, (int) entry->start, (int) entry->end); if (map->is_main_map) { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; printf("prot=%x/%x/%s, ", entry->protection, entry->max_protection, inheritance_name[entry->inheritance]); if (entry->wired_count != 0) printf("wired, "); } if (entry->is_a_map || entry->is_sub_map) { printf("share=0x%x, offset=0x%x\n", (int) entry->object.share_map, (int) entry->offset); if ((entry->prev == &map->header) || (!entry->prev->is_a_map) || (entry->prev->object.share_map != entry->object.share_map)) { indent += 2; vm_map_print((int)entry->object.share_map, full, 0, (char *)0); indent -= 2; } } else { printf("object=0x%x, offset=0x%x", (int) entry->object.vm_object, (int) entry->offset); if (entry->copy_on_write) printf(", copy (%s)", entry->needs_copy ? "needed" : "done"); printf("\n"); if ((entry->prev == &map->header) || (entry->prev->is_a_map) || (entry->prev->object.vm_object != entry->object.vm_object)) { indent += 2; vm_object_print((int)entry->object.vm_object, full, 0, (char *)0); indent -= 2; } } } indent -= 2; } #endif Index: head/sys/vm/vm_map.h =================================================================== --- head/sys/vm/vm_map.h (revision 17333) +++ head/sys/vm/vm_map.h (revision 17334) @@ -1,243 +1,242 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.h 8.3 (Berkeley) 3/15/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_map.h,v 1.13 1996/05/19 07:36:48 dyson Exp $ + * $Id: vm_map.h,v 1.14 1996/07/27 03:23:59 dyson Exp $ */ /* * Virtual memory map module definitions. */ #ifndef _VM_MAP_ #define _VM_MAP_ /* * Types defined: * * vm_map_t the high-level address map data structure. * vm_map_entry_t an entry in an address map. * vm_map_version_t a timestamp of a map, for use with vm_map_lookup */ /* * Objects which live in maps may be either VM objects, or * another map (called a "sharing map") which denotes read-write * sharing with other maps. */ union vm_map_object { struct vm_object *vm_object; /* object object */ struct vm_map *share_map; /* share map */ struct vm_map *sub_map; /* belongs to another map */ }; /* * Address map entries consist of start and end addresses, * a VM object (or sharing map) and offset into that object, * and user-exported inheritance and protection information. * Also included is control information for virtual copy operations. 
*/ struct vm_map_entry { struct vm_map_entry *prev; /* previous entry */ struct vm_map_entry *next; /* next entry */ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ union vm_map_object object; /* object I point to */ vm_ooffset_t offset; /* offset into object */ boolean_t is_a_map:1, /* Is "object" a map? */ is_sub_map:1, /* Is "object" a submap? */ /* Only in sharing maps: */ copy_on_write:1, /* is data copy-on-write */ needs_copy:1; /* does object need to be copied */ /* Only in task maps: */ vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ vm_inherit_t inheritance; /* inheritance */ int wired_count; /* can be paged if = 0 */ }; /* * Maps are doubly-linked lists of map entries, kept sorted * by address. A single hint is provided to start * searches again from the last successful search, * insertion, or removal. */ struct vm_map { struct pmap *pmap; /* Physical map */ lock_data_t lock; /* Lock for map data */ struct vm_map_entry header; /* List of entries */ int nentries; /* Number of entries */ vm_size_t size; /* virtual size */ boolean_t is_main_map; /* Am I a main map? */ int ref_count; /* Reference count */ vm_map_entry_t hint; /* hint for quick lookups */ vm_map_entry_t first_free; /* First free space hint */ boolean_t entries_pageable; /* map entries pageable?? */ unsigned int timestamp; /* Version number */ #define min_offset header.start #define max_offset header.end }; /* * Shareable process virtual address space. * May eventually be merged with vm_map. * Several fields are temporary (text, data stuff). 
*/ struct vmspace { struct vm_map vm_map; /* VM address map */ struct pmap vm_pmap; /* private physical map */ int vm_refcnt; /* number of references */ caddr_t vm_shm; /* SYS5 shared memory private data XXX */ vm_object_t vm_upages_obj; /* UPAGES object */ /* we copy from vm_startcopy to the end of the structure on fork */ #define vm_startcopy vm_rssize segsz_t vm_rssize; /* current resident set size in pages */ segsz_t vm_swrss; /* resident set size before last swap */ segsz_t vm_tsize; /* text size (pages) XXX */ segsz_t vm_dsize; /* data size (pages) XXX */ segsz_t vm_ssize; /* stack size (pages) */ caddr_t vm_taddr; /* user virtual address of text XXX */ caddr_t vm_daddr; /* user virtual address of data XXX */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ caddr_t vm_minsaddr; /* user VA at max stack growth */ }; /* * Map versions are used to validate a previous lookup attempt. * * Since lookup operations may involve both a main map and * a sharing map, it is necessary to have a timestamp from each. * [If the main map timestamp has changed, the share_map and * associated timestamp are no longer valid; the map version * does not include a reference for the embedded share_map.] */ typedef struct { int main_timestamp; vm_map_t share_map; int share_timestamp; } vm_map_version_t; /* * Macros: vm_map_lock, etc. * Function: * Perform locking on the data portion of a map. 
*/ #define vm_map_lock(map) { \ lock_write(&(map)->lock); \ (map)->timestamp++; \ } #define vm_map_unlock(map) lock_write_done(&(map)->lock) #define vm_map_lock_read(map) lock_read(&(map)->lock) #define vm_map_unlock_read(map) lock_read_done(&(map)->lock) /* * Functions implemented as macros */ #define vm_map_min(map) ((map)->min_offset) #define vm_map_max(map) ((map)->max_offset) #define vm_map_pmap(map) ((map)->pmap) /* XXX: number of kernel maps and entries to statically allocate */ #define MAX_KMAP 10 #define MAX_KMAPENT 128 /* * Copy-on-write flags for vm_map operations */ #define MAP_COPY_NEEDED 0x1 #define MAP_COPY_ON_WRITE 0x2 #ifdef KERNEL extern vm_offset_t kentry_data; extern vm_size_t kentry_data_size; boolean_t vm_map_check_protection __P((vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t)); int vm_map_copy __P((vm_map_t, vm_map_t, vm_offset_t, vm_size_t, vm_offset_t, boolean_t, boolean_t)); struct pmap; vm_map_t vm_map_create __P((struct pmap *, vm_offset_t, vm_offset_t, boolean_t)); void vm_map_deallocate __P((vm_map_t)); int vm_map_delete __P((vm_map_t, vm_offset_t, vm_offset_t)); int vm_map_find __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t, vm_prot_t, vm_prot_t, int)); int vm_map_findspace __P((vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *)); int vm_map_inherit __P((vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t)); void vm_map_init __P((struct vm_map *, vm_offset_t, vm_offset_t, boolean_t)); int vm_map_insert __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int)); int vm_map_lookup __P((vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *, boolean_t *)); void vm_map_lookup_done __P((vm_map_t, vm_map_entry_t)); boolean_t vm_map_lookup_entry __P((vm_map_t, vm_offset_t, vm_map_entry_t *)); int vm_map_pageable __P((vm_map_t, vm_offset_t, vm_offset_t, boolean_t)); int vm_map_clean __P((vm_map_t, vm_offset_t, vm_offset_t, 
boolean_t, boolean_t)); int vm_map_protect __P((vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t)); void vm_map_reference __P((vm_map_t)); int vm_map_remove __P((vm_map_t, vm_offset_t, vm_offset_t)); -void vm_map_remove_userspace __P((vm_map_t)); void vm_map_simplify __P((vm_map_t, vm_offset_t)); void vm_map_startup __P((void)); int vm_map_submap __P((vm_map_t, vm_offset_t, vm_offset_t, vm_map_t)); void vm_map_madvise __P((vm_map_t, pmap_t, vm_offset_t, vm_offset_t, int)); #endif #endif /* _VM_MAP_ */ Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 17333) +++ head/sys/vm/vm_mmap.c (revision 17334) @@ -1,1150 +1,987 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 - * $Id: vm_mmap.c,v 1.47 1996/07/27 17:21:41 dyson Exp $ + * $Id: vm_mmap.c,v 1.48 1996/07/28 02:54:09 davidg Exp $ */ /* * Mapped file (mmap) interface to VM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif /* ARGSUSED */ int sbrk(p, uap, retval) struct proc *p; struct sbrk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* ARGSUSED */ int sstk(p, uap, retval) struct proc *p; struct sstk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(p, uap, retval) struct proc *p; struct getpagesize_args *uap; int *retval; { *retval = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifndef _SYS_SYSPROTO_H_ struct 
mmap_args { caddr_t addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int mmap(p, uap, retval) struct proc *p; register struct mmap_args *uap; int *retval; { register struct filedesc *fdp = p->p_fd; register struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot, maxprot; caddr_t handle; int flags, error; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; /* * Address (if FIXED) must be page aligned. Size is implicitly rounded * to a page boundary. */ addr = (vm_offset_t) uap->addr; if (((flags & MAP_FIXED) && (addr & PAGE_MASK)) || (ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); /* * Round page if not already disallowed by above test * XXX: Is there any point in the MAP_FIXED align requirement above? */ size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); } /* * XXX if no hint provided for a non-fixed mapping place it after the * end of the largest possible heap. * * There should really be a pmap call to determine a reasonable location. */ if (addr == 0 && (flags & MAP_FIXED) == 0) addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. 
*/ if (((unsigned) uap->fd) >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) return (EINVAL); vp = (struct vnode *) fp->f_data; if (vp->v_type != VREG && vp->v_type != VCHR) return (EINVAL); /* * XXX hack to handle use of /dev/zero to map anon memory (ala * SunOS). */ if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { handle = NULL; maxprot = VM_PROT_ALL; flags |= MAP_ANON; } else { /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? */ if (fp->f_flag & FREAD) maxprot |= VM_PROT_READ; else if (prot & PROT_READ) return (EACCES); if (flags & MAP_SHARED) { if (fp->f_flag & FWRITE) maxprot |= VM_PROT_WRITE; else if (prot & PROT_WRITE) return (EACCES); } else maxprot |= VM_PROT_WRITE; handle = (caddr_t) vp; } } error = vm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, flags, handle, uap->pos); if (error == 0) *retval = (int) addr; return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(p, uap, retval) struct proc *p; register struct ommap_args *uap; int *retval; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 #define OMAP_INHERIT 0x0800 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; 
if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; if (uap->flags & OMAP_INHERIT) nargs.flags |= MAP_INHERIT; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(p, &nargs, retval)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { caddr_t addr; int len; int flags; }; #endif int msync(p, uap, retval) struct proc *p; struct msync_args *uap; int *retval; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &p->p_vmspace->vm_map; /* * XXX Gak! If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we don't * really keep track of individual mmaps so we approximate by flushing * the range of the map entry containing addr. This can be incorrect * if the region splits or is coalesced with a neighbor. */ if (size == 0) { vm_map_entry_t entry; vm_map_lock_read(map); rv = vm_map_lookup_entry(map, addr, &entry); vm_map_unlock_read(map); if (rv == FALSE) return (EINVAL); addr = entry->start; size = entry->end - entry->start; } /* * Clean the pages and interpret the return value. */ rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: break; case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? 
*/
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	caddr_t addr;
	size_t len;
};
#endif

/*
 * munmap system call: remove the mappings covering [addr, addr + len).
 * The range is rounded out to whole pages and must lie entirely within
 * the user address space; the entire range must currently be mapped.
 */
int
munmap(p, uap, retval)
	register struct proc *p;
	register struct munmap_args *uap;
	int *retval;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	/* Round the request out to whole pages. */
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return(EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses. Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return (EINVAL);
#ifndef i386
	if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
		return (EINVAL);
#endif
	/* NOTE(review): wrap-around was already rejected above; this
	 * second test is redundant. */
	if (addr + size < addr)
		return (EINVAL);
	map = &p->p_vmspace->vm_map;
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
		return (EINVAL);
	/* returns nothing but KERN_SUCCESS anyway */
	(void) vm_map_remove(map, addr, addr + size);
	return (0);
}

/*
 * Called when a mapped file descriptor is closed; only clears the
 * UF_MAPPED flag -- the regions themselves are left mapped (see XXX).
 */
void
munmapfd(p, fd)
	struct proc *p;
	int fd;
{
	/*
	 * XXX should unmap any regions mapped to this file
	 */
	p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	caddr_t addr;
	size_t len;
	int prot;
};
#endif

/*
 * mprotect system call: change the protection of [addr, addr + len)
 * to "prot" (masked to VM_PROT_ALL).  The range is page-rounded.
 */
int
mprotect(p, uap, retval)
	struct proc *p;
	struct mprotect_args *uap;
	int *retval;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	register vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	/* Round the request out to whole pages. */
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return(EINVAL);

	/* Map the vm_map_protect() status onto errno values. */
	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
		FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	caddr_t addr;
size_t len; int inherit; }; #endif int minherit(p, uap, retval) struct proc *p; struct minherit_args *uap; int *retval; { vm_offset_t addr; vm_size_t size, pageoff; register vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); switch (vm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { caddr_t addr; size_t len; int behav; }; #endif /* ARGSUSED */ int madvise(p, uap, retval) struct proc *p; struct madvise_args *uap; int *retval; { vm_map_t map; pmap_t pmap; vm_offset_t start, end; /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (VM_MAXUSER_ADDRESS > 0 && ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. 
*/ start = round_page((vm_offset_t) uap->addr); end = trunc_page((vm_offset_t) uap->addr + uap->len); map = &p->p_vmspace->vm_map; pmap = &p->p_vmspace->vm_pmap; vm_map_madvise(map, pmap, start, end, uap->behav); /* Not yet implemented */ return (0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { caddr_t addr; size_t len; char *vec; }; #endif /* ARGSUSED */ int mincore(p, uap, retval) struct proc *p; struct mincore_args *uap; int *retval; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error; int vecindex, lastvecindex; register vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) return (EINVAL); if (end < addr) return (EINVAL); /* * Address of byte vector */ vec = uap->vec; map = &p->p_vmspace->vm_map; pmap = &p->p_vmspace->vm_pmap; vm_map_lock(map); /* * Not needed here */ #if 0 VM_MAP_RANGE_CHECK(map, addr, end); #endif if (!vm_map_lookup_entry(map, addr, &entry)) entry = entry->next; /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for(current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * ignore submaps (for now) or null objects */ if (current->is_a_map || current->is_sub_map || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while(addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. 
*/ mincoreinfo = pmap_mincore(pmap, addr); if (!mincoreinfo) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); m = vm_page_lookup(current->object.vm_object, pindex); /* * if the page is resident, then gather information about * it. */ if (m) { mincoreinfo = MINCORE_INCORE; if (m->dirty || - pmap_tc_modified(m)) + pmap_is_modified(VM_PAGE_TO_PHYS(m))) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || - pmap_tc_referenced(VM_PAGE_TO_PHYS(m))) + pmap_is_referenced(VM_PAGE_TO_PHYS(m))) mincoreinfo |= MINCORE_REFERENCED_OTHER; } } /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { vm_map_unlock(map); return (EFAULT); } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte( vec + vecindex, mincoreinfo); if (error) { vm_map_unlock(map); return (EFAULT); } lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * Zero the last entries in the byte vector. 
*/
	vecindex = OFF_TO_IDX(end - first_addr);
	while((lastvecindex + 1) < vecindex) {
		error = subyte( vec + lastvecindex, 0);
		if (error) {
			vm_map_unlock(map);
			return (EFAULT);
		}
		++lastvecindex;
	}

	vm_map_unlock(map);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	caddr_t addr;
	size_t len;
};
#endif

/*
 * mlock system call: wire the pages of [addr, addr + len) into physical
 * memory.  The request is page-rounded, checked against the global wired
 * page limit and, where the pmap supports wired-count accounting, against
 * the per-process RLIMIT_MEMLOCK limit; otherwise it is superuser-only.
 */
int
mlock(p, uap, retval)
	struct proc *p;
	struct mlock_args *uap;
	int *retval;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	/* Round the request out to whole pages. */
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

	/* Refuse to exceed the system-wide wired-page limit. */
	if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

#ifdef pmap_wired_count
	/* Enforce the per-process locked-memory resource limit. */
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);
#else
	/* No wired-count accounting available: restrict to superuser. */
	error = suser(p->p_ucred, &p->p_acflag);
	if (error)
		return (error);
#endif

	error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	caddr_t addr;
	size_t len;
};
#endif

/*
 * munlock system call: unwire the pages of [addr, addr + len), making
 * them pageable again.  Superuser-only when the pmap cannot account for
 * wired pages (mirrors the mlock() policy above).
 */
int
munlock(p, uap, retval)
	struct proc *p;
	struct munlock_args *uap;
	int *retval;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	/* Round the request out to whole pages. */
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	/* disable wrap around */
	if (addr + size < addr)
		return (EINVAL);

#ifndef pmap_wired_count
	error = suser(p->p_ucred, &p->p_acflag);
	if (error)
		return (error);
#endif

	error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
*/ int vm_mmap(map, addr, size, prot, maxprot, flags, handle, foff) register vm_map_t map; register vm_offset_t *addr; register vm_size_t size; vm_prot_t prot, maxprot; register int flags; caddr_t handle; /* XXX should be vp */ vm_ooffset_t foff; { boolean_t fitit; - vm_object_t object; + vm_object_t object, object2; struct vnode *vp = NULL; objtype_t type; int rv = KERN_SUCCESS; vm_ooffset_t objsize; int docow; struct proc *p = curproc; if (size == 0) return (0); objsize = size = round_page(size); /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ if (flags & MAP_ANON) { + type = OBJT_SWAP; /* * Unnamed anonymous regions always start at 0. */ - if (handle == 0) { + if (handle == 0) foff = 0; - type = OBJT_DEFAULT; - } else { - type = OBJT_SWAP; - } } else { vp = (struct vnode *) handle; if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = (caddr_t) vp->v_rdev; } else { struct vattr vat; int error; error = VOP_GETATTR(vp, &vat, p->p_ucred, p); if (error) return (error); objsize = round_page(vat.va_size); type = OBJT_VNODE; } } + object = vm_pager_allocate(type, handle, OFF_TO_IDX(objsize), prot, foff); + if (object == NULL) + return (type == OBJT_DEVICE ? EINVAL : ENOMEM); - if (type != OBJT_DEFAULT) { - object = vm_pager_allocate(type, handle, - OFF_TO_IDX(objsize), prot, foff); - if (object == NULL) - return (type == OBJT_DEVICE ? EINVAL : ENOMEM); - } else { - object = NULL; - } - /* * Force device mappings to be shared. 
*/ if (type == OBJT_DEVICE) { flags &= ~(MAP_PRIVATE|MAP_COPY); flags |= MAP_SHARED; } + object2 = NULL; docow = 0; if ((flags & (MAP_ANON|MAP_SHARED)) == 0) { - docow = MAP_COPY_ON_WRITE|MAP_COPY_NEEDED; + docow = MAP_COPY_ON_WRITE; + if (objsize < size) { + object2 = vm_object_allocate( OBJT_DEFAULT, + OFF_TO_IDX(size - (foff & ~PAGE_MASK))); + object2->backing_object = object; + object2->backing_object_offset = foff; + TAILQ_INSERT_TAIL(&object->shadow_head, + object2, shadow_list); + ++object->shadow_count; + } else { + docow |= MAP_COPY_NEEDED; + } } - rv = vm_map_find(map, object, foff, addr, size, fitit, - prot, maxprot, docow); + if (object2) + rv = vm_map_find(map, object2, 0, addr, size, fitit, + prot, maxprot, docow); + else + rv = vm_map_find(map, object, foff, addr, size, fitit, + prot, maxprot, docow); + if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. */ - vm_object_deallocate(object); + if (object2) + vm_object_deallocate(object2); + else + vm_object_deallocate(object); goto out; } /* * "Pre-fault" resident pages. */ if ((type == OBJT_VNODE) && (map->pmap != NULL)) { pmap_object_init_pt(map->pmap, *addr, object, (vm_pindex_t) OFF_TO_IDX(foff), size, 1); } /* * Shared memory is also shared with children. 
*/ if (flags & (MAP_SHARED|MAP_INHERIT)) { rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) { (void) vm_map_remove(map, *addr, *addr + size); goto out; } } out: switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } - -#ifdef notyet -/* - * Efficient mapping of a .text+.data+.bss object - */ -int -vm_mapaout(map, baseaddr, vp, foff, textsize, datasize, bsssize, addr) - vm_map_t map; - vm_offset_t baseaddr; - struct vnode *vp; - vm_ooffset_t foff; - register vm_size_t textsize, datasize, bsssize; - vm_offset_t *addr; -{ - vm_object_t object; - int rv; - vm_pindex_t objpsize; - struct proc *p = curproc; - - vm_size_t totalsize; - vm_size_t textend; - struct vattr vat; - int error; - - textsize = round_page(textsize); - datasize = round_page(datasize); - bsssize = round_page(bsssize); - totalsize = textsize + datasize + bsssize; - - vm_map_lock(map); - /* - * If baseaddr == -1, then we need to search for space. Otherwise, - * we need to be loaded into a certain spot. 
- */ - if (baseaddr != (vm_offset_t) -1) { - if (vm_map_findspace(map, baseaddr, totalsize, addr)) { - goto outnomem; - } - - if(*addr != baseaddr) { - goto outnomem; - } - } else { - baseaddr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); - if (vm_map_findspace(map, baseaddr, totalsize, addr)) { - goto outnomem; - } - } - - if (foff & PAGE_MASK) { - vm_map_unlock(map); - return EINVAL; - } - - if ((vp->v_object != 0) && - ((((vm_object_t)vp->v_object)->flags & OBJ_DEAD) == 0)) { - object = vp->v_object; - vm_object_reference(object); - } else { - /* - * get the object size to allocate - */ - error = VOP_GETATTR(vp, &vat, p->p_ucred, p); - if (error) { - vm_map_unlock(map); - return error; - } - objpsize = OFF_TO_IDX(round_page(vat.va_size)); - /* - * Alloc/reference the object - */ - object = vm_pager_allocate(OBJT_VNODE, vp, - objpsize, VM_PROT_ALL, foff); - if (object == NULL) { - goto outnomem; - } - } - - /* - * Insert .text into the map - */ - textend = *addr + textsize; - rv = vm_map_insert(map, object, foff, - *addr, textend, - VM_PROT_READ|VM_PROT_EXECUTE, VM_PROT_ALL, - MAP_COPY_ON_WRITE|MAP_COPY_NEEDED); - if (rv != KERN_SUCCESS) { - vm_object_deallocate(object); - goto out; - } - - /* - * Insert .data into the map, if there is any to map. - */ - if (datasize != 0) { - object->ref_count++; - rv = vm_map_insert(map, object, foff + textsize, - textend, textend + datasize, - VM_PROT_ALL, VM_PROT_ALL, - MAP_COPY_ON_WRITE|MAP_COPY_NEEDED); - if (rv != KERN_SUCCESS) { - --object->ref_count; - vm_map_delete(map, *addr, textend); - goto out; - } - } - - /* - * Preload the page tables - */ - pmap_object_init_pt(map->pmap, *addr, - object, (vm_pindex_t) OFF_TO_IDX(foff), - textsize + datasize, 1); - - /* - * Get the space for bss. 
- */ - if (bsssize != 0) { - rv = vm_map_insert(map, NULL, 0, - textend + datasize, - *addr + totalsize, - VM_PROT_ALL, VM_PROT_ALL, 0); - } - if (rv != KERN_SUCCESS) { - vm_map_delete(map, *addr, textend + datasize + bsssize); - } - -out: - vm_map_unlock(map); - switch (rv) { - case KERN_SUCCESS: - return 0; - case KERN_INVALID_ADDRESS: - case KERN_NO_SPACE: - return ENOMEM; - case KERN_PROTECTION_FAILURE: - return EACCES; - default: - return EINVAL; - } -outnomem: - vm_map_unlock(map); - return ENOMEM; -} - - -int -mapaout(struct proc *p, struct mapaout_args *uap, int *retval) -{ - - register struct filedesc *fdp = p->p_fd; - struct file *fp; - struct vnode *vp; - int rtval; - - if (((unsigned) uap->fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE) - return (EINVAL); - - vp = (struct vnode *) fp->f_data; - if ((vp->v_type != VREG) && (vp->v_type != VCHR)) - return (EINVAL); - - rtval = vm_mapaout( &p->p_vmspace->vm_map, - uap->addr, vp, uap->offset, - uap->textsize, uap->datasize, uap->bsssize, - (vm_offset_t *)retval); - - return rtval; -} -#endif Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 17333) +++ head/sys/vm/vm_object.c (revision 17334) @@ -1,1482 +1,1481 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_object.c,v 1.76 1996/06/16 20:37:30 dyson Exp $ + * $Id: vm_object.c,v 1.77 1996/07/27 03:24:03 dyson Exp $ */ /* * Virtual memory object module. */ #include "opt_ddb.h" #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB static void DDB_vm_object_check __P((void)); #endif static void _vm_object_allocate __P((objtype_t, vm_size_t, vm_object_t)); #ifdef DDB static int _vm_object_in_map __P((vm_map_t map, vm_object_t object, vm_map_entry_t entry)); static int vm_object_in_map __P((vm_object_t object)); #endif static void vm_object_qcollapse __P((vm_object_t object)); #ifdef not_used static void vm_object_deactivate_pages __P((vm_object_t)); #endif static void vm_object_terminate __P((vm_object_t)); static void vm_object_cache_trim __P((void)); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. 
* * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ int vm_object_cache_max; struct object_q vm_object_cached_list; static int vm_object_cached; struct object_q vm_object_list; static long vm_object_count; vm_object_t kernel_object; vm_object_t kmem_object; static struct vm_object kernel_object_store; static struct vm_object kmem_object_store; extern int vm_pageout_page_count; static long object_collapses; static long object_bypasses; static void _vm_object_allocate(type, size, object) objtype_t type; vm_size_t size; register vm_object_t object; { TAILQ_INIT(&object->memq); TAILQ_INIT(&object->shadow_head); object->type = type; object->size = size; object->ref_count = 1; object->flags = 0; object->behavior = OBJ_NORMAL; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; object->handle = NULL; object->paging_offset = (vm_ooffset_t) 0; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; object->last_read = 0; TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); vm_object_count++; } /* * vm_object_init: * * Initialize the VM objects module. 
*/ void vm_object_init() { TAILQ_INIT(&vm_object_cached_list); TAILQ_INIT(&vm_object_list); vm_object_count = 0; vm_object_cache_max = 84; if (cnt.v_page_count > 1000) vm_object_cache_max += (cnt.v_page_count - 1000) / 4; kernel_object = &kernel_object_store; _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); kmem_object = &kmem_object_store; _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(type, size) objtype_t type; vm_size_t size; { register vm_object_t result; result = (vm_object_t) malloc((u_long) sizeof *result, M_VMOBJ, M_WAITOK); + _vm_object_allocate(type, size, result); return (result); } /* * vm_object_reference: * * Gets another reference to the given object. */ -void +inline void vm_object_reference(object) register vm_object_t object; { if (object == NULL) return; if (object->ref_count == 0) { if ((object->flags & OBJ_CANPERSIST) == 0) panic("vm_object_reference: non-persistent object with 0 ref_count"); TAILQ_REMOVE(&vm_object_cached_list, object, cached_list); vm_object_cached--; } object->ref_count++; } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. 
*/ void vm_object_deallocate(object) vm_object_t object; { vm_object_t temp; while (object != NULL) { if (object->ref_count == 0) panic("vm_object_deallocate: object deallocated too many times"); /* * Lose the reference */ object->ref_count--; if (object->ref_count != 0) { if ((object->ref_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = TAILQ_FIRST(&object->shadow_head); if ((robject != NULL) && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { int s; robject->ref_count += 2; object->ref_count += 2; do { s = splvm(); while (robject->paging_in_progress) { robject->flags |= OBJ_PIPWNT; tsleep(robject, PVM, "objde1", 0); } while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "objde2", 0); } splx(s); } while( object->paging_in_progress || robject->paging_in_progress); object->ref_count -= 2; robject->ref_count -= 2; if( robject->ref_count == 0) { robject->ref_count += 1; object = robject; continue; } vm_object_collapse(robject); return; } } /* * If there are still references, then we are done. */ return; } if (object->type == OBJT_VNODE) { struct vnode *vp = object->handle; vp->v_flag &= ~VTEXT; } /* * See if this object can persist and has some resident * pages. If so, enter it in the cache. */ if (object->flags & OBJ_CANPERSIST) { if (object->resident_page_count != 0) { vm_object_page_clean(object, 0, 0 ,TRUE, TRUE); TAILQ_INSERT_TAIL(&vm_object_cached_list, object, cached_list); vm_object_cached++; vm_object_cache_trim(); return; } else { object->flags &= ~OBJ_CANPERSIST; } } /* * Make sure no one uses us. 
*/ object->flags |= OBJ_DEAD; temp = object->backing_object; if (temp) { TAILQ_REMOVE(&temp->shadow_head, object, shadow_list); --temp->shadow_count; } vm_object_terminate(object); /* unlocks and deallocates object */ object = temp; } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. */ static void vm_object_terminate(object) register vm_object_t object; { register vm_page_t p; int s; /* * wait for the pageout daemon to be done with the object */ s = splvm(); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "objtrm", 0); } splx(s); if (object->paging_in_progress != 0) panic("vm_object_deallocate: pageout in progress"); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = object->handle; VOP_LOCK(vp); vm_object_page_clean(object, 0, 0, TRUE, FALSE); vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); VOP_UNLOCK(vp); } /* * Now free the pages. For internal objects, this also removes them * from paging queues. */ while ((p = TAILQ_FIRST(&object->memq)) != NULL) { -#if defined(DIAGNOSTIC) if (p->flags & PG_BUSY) printf("vm_object_terminate: freeing busy page\n"); -#endif PAGE_WAKEUP(p); vm_page_free(p); cnt.v_pfree++; } /* * Let the pager know object is dead. */ vm_pager_deallocate(object); TAILQ_REMOVE(&vm_object_list, object, object_list); vm_object_count--; wakeup(object); /* * Free the space for the object. */ free((caddr_t) object, M_VMOBJ); } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. * Leaves page on whatever queue it is currently on. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. 
*/ void vm_object_page_clean(object, start, end, syncio, lockflag) vm_object_t object; vm_pindex_t start; vm_pindex_t end; boolean_t syncio; boolean_t lockflag; { register vm_page_t p, np, tp; register vm_offset_t tstart, tend; vm_pindex_t pi; int s; struct vnode *vp; int runlen; int maxf; int chkb; int maxb; int i; vm_page_t maf[vm_pageout_page_count]; vm_page_t mab[vm_pageout_page_count]; vm_page_t ma[vm_pageout_page_count]; if (object->type != OBJT_VNODE || (object->flags & OBJ_MIGHTBEDIRTY) == 0) return; vp = object->handle; if (lockflag) VOP_LOCK(vp); object->flags |= OBJ_CLEANING; tstart = start; if (end == 0) { tend = object->size; } else { tend = end; } if ((tstart == 0) && (tend == object->size)) { object->flags &= ~(OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); } for(p = TAILQ_FIRST(&object->memq); p; p = TAILQ_NEXT(p, listq)) p->flags |= PG_CLEANCHK; rescan: for(p = TAILQ_FIRST(&object->memq); p; p = np) { np = TAILQ_NEXT(p, listq); pi = p->pindex; if (((p->flags & PG_CLEANCHK) == 0) || (pi < tstart) || (pi >= tend) || (p->valid == 0) || (p->queue == PQ_CACHE)) { p->flags &= ~PG_CLEANCHK; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { p->flags &= ~PG_CLEANCHK; continue; } s = splvm(); if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED|PG_REFERENCED; tsleep(p, PVM, "vpcwai", 0); splx(s); goto rescan; } splx(s); s = splvm(); maxf = 0; for(i=1;iflags & PG_BUSY) || (tp->flags & PG_CLEANCHK) == 0) break; if (tp->queue == PQ_CACHE) { tp->flags &= ~PG_CLEANCHK; break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->flags &= ~PG_CLEANCHK; break; } maf[ i - 1 ] = tp; maxf++; continue; } break; } maxb = 0; chkb = vm_pageout_page_count - maxf; if (chkb) { for(i = 1; i < chkb;i++) { if (tp = vm_page_lookup(object, pi - i)) { if ((tp->flags & PG_BUSY) || (tp->flags & PG_CLEANCHK) == 0) break; if (tp->queue == PQ_CACHE) { tp->flags &= ~PG_CLEANCHK; break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->flags &= 
~PG_CLEANCHK; break; } mab[ i - 1 ] = tp; maxb++; continue; } break; } } for(i=0;iflags |= PG_BUSY; ma[index]->flags &= ~PG_CLEANCHK; vm_page_protect(ma[index], VM_PROT_READ); } vm_page_protect(p, VM_PROT_READ); p->flags |= PG_BUSY; p->flags &= ~PG_CLEANCHK; ma[maxb] = p; for(i=0;iflags |= PG_BUSY; ma[index]->flags &= ~PG_CLEANCHK; vm_page_protect(ma[index], VM_PROT_READ); } runlen = maxb + maxf + 1; splx(s); vm_pageout_flush(ma, runlen, 0); goto rescan; } VOP_FSYNC(vp, NULL, syncio, curproc); if (lockflag) VOP_UNLOCK(vp); object->flags &= ~OBJ_CLEANING; return; } #ifdef not_used /* XXX I cannot tell if this should be an exported symbol */ /* * vm_object_deactivate_pages * * Deactivate all pages in the specified object. (Keep its pages * in memory even though it is no longer referenced.) * * The object must be locked. */ static void vm_object_deactivate_pages(object) register vm_object_t object; { register vm_page_t p, next; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); vm_page_deactivate(p); } } #endif /* * Trim the object cache to size. */ static void vm_object_cache_trim() { register vm_object_t object; while (vm_object_cached > vm_object_cache_max) { object = TAILQ_FIRST(&vm_object_cached_list); vm_object_reference(object); pager_cache(object, FALSE); } } /* * vm_object_pmap_copy: * * Makes all physical pages in the specified * object range copy-on-write. No writeable * references to these pages should remain. * * The object must *not* be locked. 
*/ void vm_object_pmap_copy(object, start, end) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; { register vm_page_t p; if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0) return; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) { vm_page_protect(p, VM_PROT_READ); } object->flags &= ~OBJ_WRITEABLE; } /* * vm_object_pmap_remove: * * Removes all physical pages in the specified * object range from all physical maps. * * The object must *not* be locked. */ void vm_object_pmap_remove(object, start, end) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; { register vm_page_t p; if (object == NULL) return; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) { if (p->pindex >= start && p->pindex < end) vm_page_protect(p, VM_PROT_NONE); } } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. */ void vm_object_madvise(object, pindex, count, advise) vm_object_t object; vm_pindex_t pindex; int count; int advise; { vm_pindex_t end; vm_page_t m; if (object == NULL) return; end = pindex + count; for (; pindex < end; pindex += 1) { m = vm_page_lookup(object, pindex); /* * If the page is busy or not in a normal active state, * we skip it. Things can break if we mess with pages * in any of the below states. 
*/ if (m == NULL || m->busy || (m->flags & PG_BUSY) || m->hold_count || m->wire_count || m->valid != VM_PAGE_BITS_ALL) continue; if (advise == MADV_WILLNEED) { if (m->queue != PQ_ACTIVE) vm_page_activate(m); } else if ((advise == MADV_DONTNEED) || ((advise == MADV_FREE) && ((object->type != OBJT_DEFAULT) && (object->type != OBJT_SWAP)))) { vm_page_deactivate(m); } else if (advise == MADV_FREE) { /* * Force a demand-zero on next ref */ if (object->type == OBJT_SWAP) swap_pager_dmzspace(object, m->pindex, 1); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } } } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow(object, offset, length) vm_object_t *object; /* IN/OUT */ vm_ooffset_t *offset; /* IN/OUT */ vm_size_t length; { register vm_object_t source; register vm_object_t result; source = *object; /* * Allocate a new object with the given length */ if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL) panic("vm_object_shadow: no object for shadowing"); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. */ result->backing_object = source; if (source) { TAILQ_INSERT_TAIL(&source->shadow_head, result, shadow_list); ++source->shadow_count; } /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; /* * Return the new things */ *offset = 0; *object = result; } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. 
*/ static void vm_object_qcollapse(object) register vm_object_t object; { register vm_object_t backing_object; register vm_pindex_t backing_offset_index, paging_offset_index; vm_pindex_t backing_object_paging_offset_index; vm_pindex_t new_pindex; register vm_page_t p, pp; register vm_size_t size; backing_object = object->backing_object; if (backing_object->ref_count != 1) return; backing_object->ref_count += 2; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset); paging_offset_index = OFF_TO_IDX(object->paging_offset); size = object->size; p = TAILQ_FIRST(&backing_object->memq); while (p) { vm_page_t next; next = TAILQ_NEXT(p, listq); if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) || (p->queue == PQ_CACHE) || !p->valid || p->hold_count || p->wire_count || p->busy) { p = next; continue; } new_pindex = p->pindex - backing_offset_index; if (p->pindex < backing_offset_index || new_pindex >= size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index+p->pindex, 1); vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, paging_offset_index + new_pindex, NULL, NULL))) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index + p->pindex, 1); vm_page_protect(p, VM_PROT_NONE); vm_page_free(p); } else { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index + p->pindex, 1); vm_page_rename(p, object, new_pindex); p->dirty = VM_PAGE_BITS_ALL; } } p = next; } backing_object->ref_count -= 2; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. 
*/ void vm_object_collapse(object) vm_object_t object; { vm_object_t backing_object; vm_ooffset_t backing_offset; vm_size_t size; vm_pindex_t new_pindex, backing_offset_index; vm_page_t p, pp; while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and no pages in it are currently being paged * out. */ if (object == NULL) return; /* * Make sure there is a backing object. */ if ((backing_object = object->backing_object) == NULL) return; /* * we check the backing object first, because it is most likely * not collapsable. */ if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { return; } if (object->paging_in_progress != 0 || backing_object->paging_in_progress != 0) { vm_object_qcollapse(object); return; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) remove * the parent's reference to it. */ backing_offset = object->backing_object_offset; backing_offset_index = OFF_TO_IDX(backing_offset); size = object->size; /* * If there is exactly one reference to the backing object, we * can collapse it into the parent. */ if (backing_object->ref_count == 1) { backing_object->flags |= OBJ_DEAD; /* * We can collapse the backing object. * * Move all in-memory pages from backing_object to the * parent. Pages that have been paged out will be * overwritten by any of the parent's pages that * shadow them. */ while ((p = TAILQ_FIRST(&backing_object->memq)) != 0) { new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this * page falls outside the parent, dispose of * it. * * Otherwise, move it as planned. 
*/ if (p->pindex < backing_offset_index || new_pindex >= size) { vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) { vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } else { vm_page_rename(p, object, new_pindex); } } } /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { backing_object->paging_in_progress++; if (object->type == OBJT_SWAP) { object->paging_in_progress++; /* * copy shadow object pages into ours * and destroy unneeded pages in * shadow object. */ swap_pager_copy( backing_object, OFF_TO_IDX(backing_object->paging_offset), object, OFF_TO_IDX(object->paging_offset), OFF_TO_IDX(object->backing_object_offset)); vm_object_pip_wakeup(object); } else { object->paging_in_progress++; /* * move the shadow backing_object's pager data to * "object" and convert "object" type to OBJT_SWAP. */ object->type = OBJT_SWAP; object->un_pager.swp.swp_nblocks = backing_object->un_pager.swp.swp_nblocks; object->un_pager.swp.swp_allocsize = backing_object->un_pager.swp.swp_allocsize; object->un_pager.swp.swp_blocks = backing_object->un_pager.swp.swp_blocks; object->un_pager.swp.swp_poip = /* XXX */ backing_object->un_pager.swp.swp_poip; object->paging_offset = backing_object->paging_offset + backing_offset; TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); /* * Convert backing object from OBJT_SWAP to * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is * actually necessary. 
*/ backing_object->type = OBJT_DEFAULT; TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list); /* * free unnecessary blocks */ swap_pager_freespace(object, 0, OFF_TO_IDX(object->paging_offset)); vm_object_pip_wakeup(object); } vm_object_pip_wakeup(backing_object); } /* * Object now shadows whatever backing_object did. * Note that the reference to backing_object->backing_object * moves from within backing_object to within object. */ TAILQ_REMOVE(&object->backing_object->shadow_head, object, shadow_list); --object->backing_object->shadow_count; if (backing_object->backing_object) { TAILQ_REMOVE(&backing_object->backing_object->shadow_head, backing_object, shadow_list); --backing_object->backing_object->shadow_count; } object->backing_object = backing_object->backing_object; if (object->backing_object) { TAILQ_INSERT_TAIL(&object->backing_object->shadow_head, object, shadow_list); ++object->backing_object->shadow_count; } object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ TAILQ_REMOVE(&vm_object_list, backing_object, object_list); vm_object_count--; free((caddr_t) backing_object, M_VMOBJ); object_collapses++; } else { /* * If all of the pages in the backing object are * shadowed by the parent object, the parent object no * longer has to shadow the backing object; it can * shadow the next one in the chain. * * The backing object must not be paged out - we'd have * to check all of the paged-out pages, as well. */ if (backing_object->type != OBJT_DEFAULT) { return; } /* * Should have a check for a 'small' number of pages * here. */ for (p = TAILQ_FIRST(&backing_object->memq); p; p = TAILQ_NEXT(p, listq)) { new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this * page falls outside the parent, keep going. 
* * Otherwise, the backing_object must be left in * the chain. */ if (p->pindex >= backing_offset_index && new_pindex <= size) { pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) { /* * Page still needed. Can't go any * further. */ return; } } } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ TAILQ_REMOVE(&object->backing_object->shadow_head, object, shadow_list); --object->backing_object->shadow_count; vm_object_reference(object->backing_object = backing_object->backing_object); if (object->backing_object) { TAILQ_INSERT_TAIL(&object->backing_object->shadow_head, object, shadow_list); ++object->backing_object->shadow_count; } object->backing_object_offset += backing_object->backing_object_offset; /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish; * so we don't need to call vm_object_deallocate. */ if (backing_object->ref_count == 1) printf("should have called obj deallocate\n"); backing_object->ref_count--; object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: [internal] * * Removes all physical pages in the specified * object range from the object's list of pages. * * The object must be locked. 
*/ void vm_object_page_remove(object, start, end, clean_only) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; boolean_t clean_only; { register vm_page_t p, next; unsigned int size; int s; if (object == NULL) return; object->paging_in_progress++; again: size = end - start; if (size > 4 || size >= object->size / 4) { for (p = TAILQ_FIRST(&object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); if ((start <= p->pindex) && (p->pindex < end)) { if (p->wire_count != 0) { vm_page_protect(p, VM_PROT_NONE); p->valid = 0; continue; } /* * The busy flags are only cleared at * interrupt -- minimize the spl transitions */ if ((p->flags & PG_BUSY) || p->busy) { s = splvm(); if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED; tsleep(p, PVM, "vmopar", 0); splx(s); goto again; } splx(s); } if (clean_only) { vm_page_test_dirty(p); if (p->valid & p->dirty) continue; } vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } } } else { while (size > 0) { if ((p = vm_page_lookup(object, start)) != 0) { if (p->wire_count != 0) { p->valid = 0; vm_page_protect(p, VM_PROT_NONE); start += 1; size -= 1; continue; } /* * The busy flags are only cleared at * interrupt -- minimize the spl transitions */ if ((p->flags & PG_BUSY) || p->busy) { s = splvm(); if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED; tsleep(p, PVM, "vmopar", 0); splx(s); goto again; } splx(s); } if (clean_only) { vm_page_test_dirty(p); if (p->valid & p->dirty) { start += 1; size -= 1; continue; } } vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } start += 1; size -= 1; } } vm_object_pip_wakeup(object); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? 
* * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * next_object Second object into coalesce * next_offset Offset into next_object * * prev_size Size of reference to prev_object * next_size Size of reference to next_object * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(prev_object, prev_pindex, prev_size, next_size) register vm_object_t prev_object; vm_pindex_t prev_pindex; vm_size_t prev_size, next_size; { vm_size_t newsize; if (prev_object == NULL) { return (TRUE); } if (prev_object->type != OBJT_DEFAULT) { return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->ref_count > 1 || prev_object->backing_object != NULL) { return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; /* * Remove any pages that may still be in the object from a previous * deallocation. */ vm_object_page_remove(prev_object, prev_pindex + prev_size, prev_pindex + prev_size + next_size, FALSE); /* * Extend the object if necessary. 
*/ newsize = prev_pindex + prev_size + next_size; if (newsize > prev_object->size) prev_object->size = newsize; return (TRUE); } #ifdef DDB static int _vm_object_in_map(map, object, entry) vm_map_t map; vm_object_t object; vm_map_entry_t entry; { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if( _vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->is_sub_map || entry->is_a_map) { tmpm = entry->object.share_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if( _vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (obj = entry->object.vm_object) { for(; obj; obj=obj->backing_object) if( obj == object) { return 1; } } return 0; } static int vm_object_in_map( object) vm_object_t object; { struct proc *p; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if( !p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if( _vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) return 1; } if( _vm_object_in_map( kernel_map, object, 0)) return 1; if( _vm_object_in_map( kmem_map, object, 0)) return 1; if( _vm_object_in_map( pager_map, object, 0)) return 1; if( _vm_object_in_map( buffer_map, object, 0)) return 1; if( _vm_object_in_map( io_map, object, 0)) return 1; if( _vm_object_in_map( phys_map, object, 0)) return 1; if( _vm_object_in_map( mb_map, object, 0)) return 1; if( _vm_object_in_map( u_map, object, 0)) return 1; return 0; } #ifdef DDB static void DDB_vm_object_check() { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. 
*/ for (object = TAILQ_FIRST(&vm_object_list); object != NULL; object = TAILQ_NEXT(object, object_list)) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { printf("vmochk: internal obj has zero ref count: %d\n", object->size); } if (!vm_object_in_map(object)) { printf("vmochk: internal obj is not in a map: " "ref: %d, size: %d: 0x%x, backing_object: 0x%x\n", object->ref_count, object->size, object->size, object->backing_object); } } } } #endif /* DDB */ /* * vm_object_print: [ debug ] */ void vm_object_print(iobject, full, dummy3, dummy4) /* db_expr_t */ int iobject; boolean_t full; /* db_expr_t */ int dummy3; char *dummy4; { vm_object_t object = (vm_object_t)iobject; /* XXX */ register vm_page_t p; register int count; if (object == NULL) return; iprintf("Object 0x%x: size=0x%x, res=%d, ref=%d, ", (int) object, (int) object->size, object->resident_page_count, object->ref_count); printf("offset=0x%x, backing_object=(0x%x)+0x%x\n", (int) object->paging_offset, (int) object->backing_object, (int) object->backing_object_offset); printf("cache: next=%p, prev=%p\n", TAILQ_NEXT(object, cached_list), TAILQ_PREV(object, cached_list)); if (!full) return; indent += 2; count = 0; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) { if (count == 0) iprintf("memory:="); else if (count == 6) { printf("\n"); iprintf(" ..."); count = 0; } else printf(","); count++; printf("(off=0x%lx,page=0x%lx)", (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); } if (count != 0) printf("\n"); indent -= 2; } #endif /* DDB */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 17333) +++ head/sys/vm/vm_page.c (revision 17334) @@ -1,1187 +1,1204 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. 
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 - * $Id: vm_page.c,v 1.60 1996/06/26 05:39:25 dyson Exp $ + * $Id: vm_page.c,v 1.61 1996/07/27 03:24:05 dyson Exp $ */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern void DDB_print_page_info __P((void)); #endif /* * Associated with page of user-allocatable memory is a * page structure. */ static struct pglist *vm_page_buckets; /* Array of buckets */ static int vm_page_bucket_count; /* How big is array? 
*/ static int vm_page_hash_mask; /* Mask for hash function */ struct pglist vm_page_queue_free; struct pglist vm_page_queue_zero; struct pglist vm_page_queue_active; struct pglist vm_page_queue_inactive; struct pglist vm_page_queue_cache; int no_queue; struct { struct pglist *pl; int *cnt; } vm_page_queues[PQ_CACHE+1] = { {NULL, &no_queue}, { &vm_page_queue_free, &cnt.v_free_count}, { &vm_page_queue_zero, &cnt.v_free_count}, { &vm_page_queue_inactive, &cnt.v_inactive_count}, { &vm_page_queue_active, &cnt.v_active_count}, { &vm_page_queue_cache, &cnt.v_cache_count} }; vm_page_t vm_page_array; static int vm_page_array_size; long first_page; static long last_page; static vm_size_t page_mask; static int page_shift; int vm_page_zero_count; /* * map of contiguous valid DEV_BSIZE chunks in a page * (this list is valid for page sizes upto 16*DEV_BSIZE) */ static u_short vm_page_dev_bsize_chunks[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff }; static inline __pure int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex)) __pure2; static int vm_page_freechk_and_unqueue __P((vm_page_t m)); static void vm_page_free_wakeup __P((void)); /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. * * Sets page_shift and page_mask from cnt.v_page_size. */ void vm_set_page_size() { if (cnt.v_page_size == 0) cnt.v_page_size = DEFAULT_PAGE_SIZE; page_mask = cnt.v_page_size - 1; if ((page_mask & cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); for (page_shift = 0;; page_shift++) if ((1 << page_shift) == cnt.v_page_size) break; } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. 
*/ vm_offset_t vm_page_startup(starta, enda, vaddr) register vm_offset_t starta; vm_offset_t enda; register vm_offset_t vaddr; { register vm_offset_t mapped; register vm_page_t m; register struct pglist *bucket; vm_size_t npages, page_range; register vm_offset_t new_start; int i; vm_offset_t pa; int nblocks; vm_offset_t first_managed_page; /* the biggest memory array is the second group of pages */ vm_offset_t start; vm_offset_t biggestone, biggestsize; vm_offset_t total; total = 0; biggestsize = 0; biggestone = 0; nblocks = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { int size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } ++nblocks; total += size; } start = phys_avail[biggestone]; /* * Initialize the queue headers for the free queue, the active queue * and the inactive queue. */ TAILQ_INIT(&vm_page_queue_free); TAILQ_INIT(&vm_page_queue_zero); TAILQ_INIT(&vm_page_queue_active); TAILQ_INIT(&vm_page_queue_inactive); TAILQ_INIT(&vm_page_queue_cache); /* * Allocate (and initialize) the hash table buckets. * * The number of buckets MUST BE a power of 2, and the actual value is * the next power of 2 greater than the number of physical pages in * the system. * * Note: This computation can be tweaked if desired. */ vm_page_buckets = (struct pglist *) vaddr; bucket = vm_page_buckets; if (vm_page_bucket_count == 0) { vm_page_bucket_count = 1; while (vm_page_bucket_count < atop(total)) vm_page_bucket_count <<= 1; } vm_page_hash_mask = vm_page_bucket_count - 1; /* * Validate these addresses. 
*/ new_start = start + vm_page_bucket_count * sizeof(struct pglist); new_start = round_page(new_start); mapped = vaddr; vaddr = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; bzero((caddr_t) mapped, vaddr - mapped); mapped = vaddr; for (i = 0; i < vm_page_bucket_count; i++) { TAILQ_INIT(bucket); bucket++; } /* * round (or truncate) the addresses to our page size. */ /* * Pre-allocate maps and map entries that cannot be dynamically * allocated via malloc(). The maps include the kernel_map and * kmem_map which must be initialized before malloc() will work * (obviously). Also could include pager maps which would be * allocated before kmeminit. * * Allow some kernel map entries... this should be plenty since people * shouldn't be cluttering up the kernel map (they should use their * own maps). */ kentry_data_size = MAX_KMAP * sizeof(struct vm_map) + MAX_KMAPENT * sizeof(struct vm_map_entry); kentry_data_size = round_page(kentry_data_size); kentry_data = (vm_offset_t) vaddr; vaddr += kentry_data_size; /* * Validate these zone addresses. */ new_start = start + (vaddr - mapped); pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); bzero((caddr_t) mapped, (vaddr - mapped)); start = round_page(new_start); /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = phys_avail[0] / PAGE_SIZE; last_page = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE; page_range = last_page - (phys_avail[0] / PAGE_SIZE); npages = (total - (page_range * sizeof(struct vm_page)) - (start - phys_avail[biggestone])) / PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ vm_page_array = (vm_page_t) vaddr; mapped = vaddr; /* * Validate these addresses. 
*/ new_start = round_page(start + page_range * sizeof(struct vm_page)); mapped = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; first_managed_page = start / PAGE_SIZE; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); vm_page_array_size = page_range; cnt.v_page_count = 0; cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { if (i == biggestone) pa = ptoa(first_managed_page); else pa = phys_avail[i]; while (pa < phys_avail[i + 1] && npages-- > 0) { ++cnt.v_page_count; ++cnt.v_free_count; m = PHYS_TO_VM_PAGE(pa); m->queue = PQ_FREE; m->flags = 0; m->phys_addr = pa; TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); pa += PAGE_SIZE; } } return (mapped); } /* * vm_page_hash: * * Distributes the object/offset key pair among hash buckets. * * NOTE: This macro depends on vm_page_bucket_count being a power of 2. */ static inline __pure int vm_page_hash(object, pindex) vm_object_t object; vm_pindex_t pindex; { return ((((unsigned) object) >> 5) + (pindex >> 1)) & vm_page_hash_mask; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object/object-page * table and object list. * * The object and page must be locked, and must be splhigh. */ -void +__inline void vm_page_insert(m, object, pindex) register vm_page_t m; register vm_object_t object; register vm_pindex_t pindex; { register struct pglist *bucket; if (m->flags & PG_TABLED) panic("vm_page_insert: already inserted"); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Insert it into the object_object/offset hash table */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; TAILQ_INSERT_TAIL(bucket, m, hashq); /* * Now link into the object's list of backed pages. */ TAILQ_INSERT_TAIL(&object->memq, m, listq); m->flags |= PG_TABLED; /* * And show that the object has one more resident page. 
*/ object->resident_page_count++; } /* * vm_page_remove: [ internal use only ] * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page * table and the object page list. * * The object and page must be locked, and at splhigh. */ -void +__inline void vm_page_remove(m) register vm_page_t m; { register struct pglist *bucket; if (!(m->flags & PG_TABLED)) return; /* * Remove from the object_object/offset hash table */ bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)]; TAILQ_REMOVE(bucket, m, hashq); /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&m->object->memq, m, listq); /* * And show that the object has one fewer resident page. */ m->object->resident_page_count--; m->flags &= ~PG_TABLED; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. No side effects. */ vm_page_t vm_page_lookup(object, pindex) register vm_object_t object; register vm_pindex_t pindex; { register vm_page_t m; register struct pglist *bucket; int s; /* * Search the hash table for this object/offset pair */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; s = splvm(); for (m = TAILQ_FIRST(bucket); m != NULL; m = TAILQ_NEXT(m,hashq)) { if ((m->object == object) && (m->pindex == pindex)) { splx(s); return (m); } } splx(s); return (NULL); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * The object must be locked. 
*/ void vm_page_rename(m, new_object, new_pindex) register vm_page_t m; register vm_object_t new_object; vm_pindex_t new_pindex; { int s; s = splvm(); vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); splx(s); } /* + * vm_page_unqueue without any wakeup + */ +__inline void +vm_page_unqueue_nowakeup(m) + vm_page_t m; +{ + int queue = m->queue; + if (queue != PQ_NONE) { + m->queue = PQ_NONE; + TAILQ_REMOVE(vm_page_queues[queue].pl, m, pageq); + --(*vm_page_queues[queue].cnt); + } +} + + +/* * vm_page_unqueue must be called at splhigh(); */ __inline void -vm_page_unqueue(m, wakeup) +vm_page_unqueue(m) vm_page_t m; - int wakeup; { int queue = m->queue; if (queue != PQ_NONE) { m->queue = PQ_NONE; TAILQ_REMOVE(vm_page_queues[queue].pl, m, pageq); --(*vm_page_queues[queue].cnt); - if ((queue == PQ_CACHE) && wakeup) { + if (queue == PQ_CACHE) { if ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_free_reserved + cnt.v_cache_min)) pagedaemon_wakeup(); } } } /* * vm_page_alloc: * * Allocate and return a memory cell associated * with this VM object/offset pair. * * page_req classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * VM_ALLOC_ZERO zero page * * Object must be locked. 
*/ vm_page_t vm_page_alloc(object, pindex, page_req) vm_object_t object; vm_pindex_t pindex; int page_req; { register vm_page_t m; int queue; int s; #ifdef DIAGNOSTIC m = vm_page_lookup(object, pindex); if (m) panic("vm_page_alloc: page already allocated"); #endif if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; s = splvm(); switch (page_req) { case VM_ALLOC_NORMAL: if (cnt.v_free_count >= cnt.v_free_reserved) { m = TAILQ_FIRST(&vm_page_queue_free); if (m == NULL) { --vm_page_zero_count; m = TAILQ_FIRST(&vm_page_queue_zero); } } else { m = TAILQ_FIRST(&vm_page_queue_cache); if (m == NULL) { splx(s); #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_ZERO: if (cnt.v_free_count >= cnt.v_free_reserved) { m = TAILQ_FIRST(&vm_page_queue_zero); if (m) { --vm_page_zero_count; } else { m = TAILQ_FIRST(&vm_page_queue_free); } } else { m = TAILQ_FIRST(&vm_page_queue_cache); if (m == NULL) { splx(s); #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(ZERO): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_SYSTEM: if ((cnt.v_free_count >= cnt.v_free_reserved) || ((cnt.v_cache_count == 0) && (cnt.v_free_count >= cnt.v_interrupt_free_min))) { m = TAILQ_FIRST(&vm_page_queue_free); if (m == NULL) { --vm_page_zero_count; m = TAILQ_FIRST(&vm_page_queue_zero); } } else { m = TAILQ_FIRST(&vm_page_queue_cache); if (m == NULL) { splx(s); #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(SYSTEM): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_INTERRUPT: if (cnt.v_free_count > 0) { m = TAILQ_FIRST(&vm_page_queue_free); if (m == NULL) { --vm_page_zero_count; m = TAILQ_FIRST(&vm_page_queue_zero); 
} } else { splx(s); pagedaemon_wakeup(); return (NULL); } break; default: panic("vm_page_alloc: invalid allocation class"); } queue = m->queue; TAILQ_REMOVE(vm_page_queues[queue].pl, m, pageq); --(*vm_page_queues[queue].cnt); if (queue == PQ_ZERO) { m->flags = PG_ZERO|PG_BUSY; } else if (queue == PQ_CACHE) { vm_page_remove(m); m->flags = PG_BUSY; } else { m->flags = PG_BUSY; } m->wire_count = 0; m->hold_count = 0; m->act_count = 0; m->busy = 0; m->valid = 0; m->dirty = 0; m->queue = PQ_NONE; /* XXX before splx until vm_page_insert is safe */ vm_page_insert(m, object, pindex); splx(s); /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_reserved + cnt.v_cache_min)) || (cnt.v_free_count < cnt.v_pageout_free_min)) pagedaemon_wakeup(); return (m); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * * The page queues must be locked. */ void vm_page_activate(m) register vm_page_t m; { int s; s = splvm(); if (m->queue == PQ_ACTIVE) panic("vm_page_activate: already active"); if (m->queue == PQ_CACHE) cnt.v_reactivated++; - vm_page_unqueue(m, 1); + vm_page_unqueue(m); if (m->wire_count == 0) { TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); m->queue = PQ_ACTIVE; if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; cnt.v_active_count++; } splx(s); } /* * helper routine for vm_page_free and vm_page_free_zero */ -__inline static int +static int vm_page_freechk_and_unqueue(m) vm_page_t m; { if (m->busy || (m->flags & PG_BUSY) || (m->queue == PQ_FREE) || (m->hold_count != 0)) { printf("vm_page_free: pindex(%ld), busy(%d), PG_BUSY(%d), hold(%d)\n", m->pindex, m->busy, (m->flags & PG_BUSY) ? 
1 : 0, m->hold_count); if (m->queue == PQ_FREE) panic("vm_page_free: freeing free page"); else panic("vm_page_free: freeing busy page"); } vm_page_remove(m); - vm_page_unqueue(m,0); + vm_page_unqueue_nowakeup(m); if ((m->flags & PG_FICTITIOUS) != 0) { return 0; } if (m->wire_count != 0) { if (m->wire_count > 1) { panic("vm_page_free: invalid wire count (%d), pindex: 0x%x", m->wire_count, m->pindex); } m->wire_count = 0; cnt.v_wire_count--; } return 1; } /* * helper routine for vm_page_free and vm_page_free_zero */ -__inline static void +static __inline void vm_page_free_wakeup() { /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) { wakeup(&cnt.v_free_count); vm_pages_needed = 0; } } /* * vm_page_free: * * Returns the given page to the free list, * disassociating it with any VM object. * * Object and page must be locked prior to entry. */ void vm_page_free(m) register vm_page_t m; { int s; s = splvm(); cnt.v_tfree++; if (!vm_page_freechk_and_unqueue(m)) { splx(s); return; } m->queue = PQ_FREE; /* * If the pageout process is grabbing the page, it is likely * that the page is NOT in the cache. It is more likely that * the page will be partially in the cache if it is being * explicitly freed. 
*/ if (curproc == pageproc) { TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); } else { TAILQ_INSERT_HEAD(&vm_page_queue_free, m, pageq); } cnt.v_free_count++; vm_page_free_wakeup(); splx(s); } void vm_page_free_zero(m) register vm_page_t m; { int s; s = splvm(); cnt.v_tfree++; if (!vm_page_freechk_and_unqueue(m)) { splx(s); return; } m->queue = PQ_ZERO; TAILQ_INSERT_HEAD(&vm_page_queue_zero, m, pageq); ++vm_page_zero_count; cnt.v_free_count++; vm_page_free_wakeup(); splx(s); } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * The page queues must be locked. */ void vm_page_wire(m) register vm_page_t m; { int s; if (m->wire_count == 0) { s = splvm(); - vm_page_unqueue(m,1); + vm_page_unqueue(m); splx(s); cnt.v_wire_count++; } m->wire_count++; m->flags |= PG_MAPPED; } /* * vm_page_unwire: * * Release one wiring of this page, potentially * enabling it to be paged again. * * The page queues must be locked. */ void vm_page_unwire(m) register vm_page_t m; { int s; s = splvm(); if (m->wire_count > 0) m->wire_count--; if (m->wire_count == 0) { cnt.v_wire_count--; TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); m->queue = PQ_ACTIVE; cnt.v_active_count++; } splx(s); } /* * vm_page_deactivate: * * Returns the given page to the inactive list, * indicating that no physical maps have access * to this page. [Used by the physical mapping system.] * * The page queues must be locked. */ void vm_page_deactivate(m) register vm_page_t m; { int s; /* * Only move active pages -- ignore locked or already inactive ones. * * XXX: sometimes we get pages which aren't wired down or on any queue - * we need to put them on the inactive queue also, otherwise we lose * track of them. Paul Mackerras (paulus@cs.anu.edu.au) 9-Jan-93. 
*/ if (m->queue == PQ_INACTIVE) return; s = splvm(); if (m->wire_count == 0 && m->hold_count == 0) { if (m->queue == PQ_CACHE) cnt.v_reactivated++; - vm_page_unqueue(m,1); + vm_page_unqueue(m); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); m->queue = PQ_INACTIVE; cnt.v_inactive_count++; } splx(s); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). */ void vm_page_cache(m) register vm_page_t m; { int s; if ((m->flags & PG_BUSY) || m->busy || m->wire_count) { printf("vm_page_cache: attempting to cache busy page\n"); return; } if (m->queue == PQ_CACHE) return; vm_page_protect(m, VM_PROT_NONE); if (m->dirty != 0) { panic("vm_page_cache: caching a dirty page, pindex: %d", m->pindex); } s = splvm(); - vm_page_unqueue(m,0); + vm_page_unqueue_nowakeup(m); TAILQ_INSERT_TAIL(&vm_page_queue_cache, m, pageq); m->queue = PQ_CACHE; cnt.v_cache_count++; vm_page_free_wakeup(); splx(s); } /* * mapping function for valid bits or for dirty bits in * a page */ inline int vm_page_bits(int base, int size) { u_short chunk; if ((base == 0) && (size >= PAGE_SIZE)) return VM_PAGE_BITS_ALL; size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); base = (base % PAGE_SIZE) / DEV_BSIZE; chunk = vm_page_dev_bsize_chunks[size / DEV_BSIZE]; return (chunk << base) & VM_PAGE_BITS_ALL; } /* * set a page valid and clean */ void vm_page_set_validclean(m, base, size) vm_page_t m; int base; int size; { int pagebits = vm_page_bits(base, size); m->valid |= pagebits; m->dirty &= ~pagebits; if( base == 0 && size == PAGE_SIZE) - pmap_tc_modified(m); + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); } /* * set a page (partially) invalid */ void vm_page_set_invalid(m, base, size) vm_page_t m; int base; int size; { int bits; m->valid &= ~(bits = vm_page_bits(base, size)); if (m->valid == 0) m->dirty &= ~bits; } /* * is (partial) page valid? 
*/ int vm_page_is_valid(m, base, size) vm_page_t m; int base; int size; { int bits = vm_page_bits(base, size); if (m->valid && ((m->valid & bits) == bits)) return 1; else return 0; } void vm_page_test_dirty(m) vm_page_t m; { - if (m->dirty != VM_PAGE_BITS_ALL) - pmap_tc_modified(m); + if ((m->dirty != VM_PAGE_BITS_ALL) && + pmap_is_modified(VM_PAGE_TO_PHYS(m))) { + m->dirty = VM_PAGE_BITS_ALL; + } } /* * This interface is for merging with malloc() someday. * Even if we never implement compaction so that contiguous allocation * works after initialization time, malloc()'s data structures are good * for statistics and for allocations of less than a page. */ void * contigmalloc(size, type, flags, low, high, alignment, boundary) unsigned long size; /* should be size_t here and for malloc() */ int type; int flags; unsigned long low; unsigned long high; unsigned long alignment; unsigned long boundary; { int i, s, start; vm_offset_t addr, phys, tmp_addr; vm_page_t pga = vm_page_array; size = round_page(size); if (size == 0) panic("vm_page_alloc_contig: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("vm_page_alloc_contig: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("vm_page_alloc_contig: boundary must be a power of 2"); start = 0; s = splvm(); again: /* * Find first page in array that is free, within range, aligned, and * such that the boundary won't be crossed. */ for (i = start; i < cnt.v_page_count; i++) { phys = VM_PAGE_TO_PHYS(&pga[i]); if ((pga[i].queue == PQ_FREE) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)) break; } /* * If the above failed or we will exceed the upper bound, fail. */ if ((i == cnt.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { splx(s); return (NULL); } start = i; /* * Check successive pages for contiguous and free. 
*/ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { if ((VM_PAGE_TO_PHYS(&pga[i]) != (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || (pga[i].queue != PQ_FREE)) { start++; goto again; } } /* * We've found a contiguous chunk that meets are requirements. * Allocate kernel VM, unfree and assign the physical pages to it and * return kernel VM pointer. */ tmp_addr = addr = kmem_alloc_pageable(kernel_map, size); if (addr == 0) { splx(s); return (NULL); } for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; TAILQ_REMOVE(&vm_page_queue_free, m, pageq); cnt.v_free_count--; m->valid = VM_PAGE_BITS_ALL; m->flags = 0; m->dirty = 0; m->wire_count = 0; m->busy = 0; m->queue = PQ_NONE; vm_page_insert(m, kernel_object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); vm_page_wire(m); pmap_kenter(tmp_addr, VM_PAGE_TO_PHYS(m)); tmp_addr += PAGE_SIZE; } splx(s); return ((void *)addr); } vm_offset_t vm_page_alloc_contig(size, low, high, alignment) vm_offset_t size; vm_offset_t low; vm_offset_t high; vm_offset_t alignment; { return ((vm_offset_t)contigmalloc(size, M_DEVBUF, M_NOWAIT, low, high, alignment, 0ul)); } #ifdef DDB void DDB_print_page_info(void) { printf("cnt.v_free_count: %d\n", cnt.v_free_count); printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); printf("cnt.v_active_count: %d\n", cnt.v_active_count); printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); printf("cnt.v_free_min: %d\n", cnt.v_free_min); printf("cnt.v_free_target: %d\n", cnt.v_free_target); printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); } #endif Index: head/sys/vm/vm_page.h =================================================================== --- head/sys/vm/vm_page.h (revision 17333) +++ head/sys/vm/vm_page.h (revision 17334) @@ -1,334 +1,334 @@ /* * Copyright (c) 1991, 1993 * The Regents of the 
University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_page.h,v 1.29 1996/06/26 05:39:25 dyson Exp $ + * $Id: vm_page.h,v 1.30 1996/07/27 03:24:06 dyson Exp $ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several lists: * * A hash table bucket used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * Fields in this structure are locked either by the lock on the * object that the page belongs to (O) or by the lock on the page * queues (P). 
*/ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */ TAILQ_ENTRY(vm_page) hashq; /* hash table links (O) */ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_offset_t phys_addr; /* physical address of page */ u_short queue:4, /* page queue index */ flags:12; /* see below */ u_short wire_count; /* wired down maps refs (P) */ short hold_count; /* page hold count */ u_char act_count; /* page usage count */ u_char busy; /* page busy count */ /* NOTE that these must support one bit per DEV_BSIZE in a page!!! */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ u_char valid; /* map of valid DEV_BSIZE chunks */ u_char dirty; /* map of dirty DEV_BSIZE chunks */ }; #define PQ_NONE 0 #define PQ_FREE 1 #define PQ_ZERO 2 #define PQ_INACTIVE 3 #define PQ_ACTIVE 4 #define PQ_CACHE 5 /* * These are the flags defined for vm_page. * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. */ #define PG_BUSY 0x01 /* page is in transit (O) */ #define PG_WANTED 0x02 /* someone is waiting for page (O) */ #define PG_TABLED 0x04 /* page is in VP table (O) */ #define PG_FICTITIOUS 0x08 /* physical page doesn't exist (O) */ #define PG_WRITEABLE 0x10 /* page is mapped writeable */ #define PG_MAPPED 0x20 /* page is mapped */ #define PG_ZERO 0x40 /* page is zeroed */ #define PG_REFERENCED 0x80 /* page has been referenced */ #define PG_CLEANCHK 0x100 /* page has been checked for cleaning */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 32 #define PFCLUSTER_BEHIND 3 #define PFCLUSTER_AHEAD 3 #ifdef KERNEL /* * Each pageable resident page falls into one of four lists: * * free * Available for allocation now. * * The following are all LRU sorted: * * cache * Almost available for allocation. 
Still in an * object, but clean and immediately freeable at * non-interrupt times. * * inactive * Low activity, candidates for reclamation. * This is the list of pages that should be * paged out next. * * active * Pages that are "active" i.e. they have been * recently referenced. * * zero * Pages that are really free and have been pre-zeroed * */ extern struct pglist vm_page_queue_free; /* memory free queue */ extern struct pglist vm_page_queue_zero; /* zeroed memory free queue */ extern struct pglist vm_page_queue_active; /* active memory queue */ extern struct pglist vm_page_queue_inactive; /* inactive memory queue */ extern struct pglist vm_page_queue_cache; /* cache memory queue */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern long first_page; /* first physical page number */ /* ... represented in vm_page_array */ extern long last_page; /* last physical page number */ /* ... represented in vm_page_array */ /* [INCLUSIVE] */ extern vm_offset_t first_phys_addr; /* physical address for first_page */ extern vm_offset_t last_phys_addr; /* physical address for last_page */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) #define IS_VM_PHYSADDR(pa) \ ((pa) >= first_phys_addr && (pa) <= last_phys_addr) #define PHYS_TO_VM_PAGE(pa) \ (&vm_page_array[atop(pa) - first_page ]) /* * Functions implemented as macros */ #define PAGE_ASSERT_WAIT(m, interruptible) { \ (m)->flags |= PG_WANTED; \ assert_wait((int) (m), (interruptible)); \ } #define PAGE_WAKEUP(m) { \ (m)->flags &= ~PG_BUSY; \ if ((m)->flags & PG_WANTED) { \ (m)->flags &= ~PG_WANTED; \ - (m)->flags |= PG_REFERENCED; \ wakeup((caddr_t) (m)); \ } \ } #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xff #endif #if PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffff #endif #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_ZERO 3 void vm_page_activate __P((vm_page_t)); vm_page_t vm_page_alloc __P((vm_object_t, 
vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); void vm_page_deactivate __P((vm_page_t)); void vm_page_free __P((vm_page_t)); void vm_page_free_zero __P((vm_page_t)); void vm_page_insert __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_pindex_t)); void vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); void vm_page_unwire __P((vm_page_t)); void vm_page_wire __P((vm_page_t)); -void vm_page_unqueue __P((vm_page_t, int)); +void vm_page_unqueue __P((vm_page_t)); +void vm_page_unqueue_nowakeup __P((vm_page_t)); void vm_page_set_validclean __P((vm_page_t, int, int)); void vm_page_set_invalid __P((vm_page_t, int, int)); static __inline boolean_t vm_page_zero_fill __P((vm_page_t)); int vm_page_is_valid __P((vm_page_t, int, int)); void vm_page_test_dirty __P((vm_page_t)); int vm_page_bits __P((int, int)); /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). 
*/ static __inline void vm_page_hold(vm_page_t mem) { mem->hold_count++; } #ifdef DIAGNOSTIC #include /* make GCC shut up */ #endif static __inline void vm_page_unhold(vm_page_t mem) { #ifdef DIAGNOSTIC if (--mem->hold_count < 0) panic("vm_page_unhold: hold count < 0!!!"); #else --mem->hold_count; #endif } static __inline void vm_page_protect(vm_page_t mem, int prot) { if (prot == VM_PROT_NONE) { if (mem->flags & (PG_WRITEABLE|PG_MAPPED)) { - pmap_page_protect(mem, prot); + pmap_page_protect(VM_PAGE_TO_PHYS(mem), prot); mem->flags &= ~(PG_WRITEABLE|PG_MAPPED); } } else if ((prot == VM_PROT_READ) && (mem->flags & PG_WRITEABLE)) { - pmap_page_protect(mem, prot); + pmap_page_protect(VM_PAGE_TO_PHYS(mem), prot); mem->flags &= ~PG_WRITEABLE; } } /* * vm_page_zero_fill: * * Zero-fill the specified page. * Written as a standard pagein routine, to * be used by the zero-fill object. */ static __inline boolean_t vm_page_zero_fill(m) vm_page_t m; { pmap_zero_page(VM_PAGE_TO_PHYS(m)); return (TRUE); } /* * vm_page_copy: * * Copy one page to another */ static __inline void vm_page_copy(src_m, dest_m) vm_page_t src_m; vm_page_t dest_m; { pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m)); dest_m->valid = VM_PAGE_BITS_ALL; } #endif /* KERNEL */ #endif /* !_VM_PAGE_ */ Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 17333) +++ head/sys/vm/vm_pageout.c (revision 17334) @@ -1,1100 +1,1102 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pageout.c,v 1.81 1996/07/08 02:25:53 dyson Exp $ + * $Id: vm_pageout.c,v 1.83 1996/07/27 03:24:08 dyson Exp $ */ /* * The proverbial page-out daemon. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout __P((void)); static int vm_pageout_clean __P((vm_page_t, int)); static int vm_pageout_scan __P((void)); static int vm_pageout_free_page_calc __P((vm_size_t count)); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT_KT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon __P((void)); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT_KT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) #endif int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ extern int npendingio; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; #endif extern int nswiodone; extern int vm_swap_size; extern int vfs_update_wakeup; int vm_pageout_algorithm_lru=0; #if defined(NO_SWAPPING) int vm_swapping_enabled=0; #else int vm_swapping_enabled=1; #endif SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, ""); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swapping_enabled, CTLFLAG_RD, &vm_swapping_enabled, 0, ""); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swapping_enabled, CTLFLAG_RW, &vm_swapping_enabled, 0, ""); #endif #define MAXLAUNDER (cnt.v_page_count > 1800 ? 
32 : 16) #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) typedef void freeer_fcn_t __P((vm_map_t, vm_object_t, vm_pindex_t, int)); static void vm_pageout_map_deactivate_pages __P((vm_map_t, vm_pindex_t)); static freeer_fcn_t vm_pageout_object_deactivate_pages; static void vm_req_vmdaemon __P((void)); #endif /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. * * And we set pageout-in-progress to keep the object from disappearing * during pageout. This guarantees that the page won't move from the * inactive queue. (However, any other page on the inactive queue may * move!) */ static int vm_pageout_clean(m, sync) vm_page_t m; int sync; { register vm_object_t object; vm_page_t mc[2*vm_pageout_page_count]; int pageout_count; int i, forward_okay, backward_okay, page_base; vm_pindex_t pindex = m->pindex; object = m->object; /* * If not OBJT_SWAP, additional memory may be needed to do the pageout. * Try to avoid the deadlock. */ if ((sync != VM_PAGEOUT_FORCE) && (object->type == OBJT_DEFAULT) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)) return 0; /* * Don't mess with the page if it's busy. */ if ((!sync && m->hold_count != 0) || ((m->busy != 0) || (m->flags & PG_BUSY))) return 0; /* * Try collapsing before it's too late. */ if (!sync && object->backing_object) { vm_object_collapse(object); } mc[vm_pageout_page_count] = m; pageout_count = 1; page_base = vm_pageout_page_count; forward_okay = TRUE; if (pindex != 0) backward_okay = TRUE; else backward_okay = FALSE; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. 
*/ for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) { vm_page_t p; /* * See if forward page is clusterable. */ if (forward_okay) { /* * Stop forward scan at end of object. */ if ((pindex + i) > object->size) { forward_okay = FALSE; goto do_backward; } p = vm_page_lookup(object, pindex + i); if (p) { if ((p->queue == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { forward_okay = FALSE; goto do_backward; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && ((p->queue == PQ_INACTIVE) || (sync == VM_PAGEOUT_FORCE)) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[vm_pageout_page_count + i] = p; pageout_count++; if (pageout_count == vm_pageout_page_count) break; } else { forward_okay = FALSE; } } else { forward_okay = FALSE; } } do_backward: /* * See if backward page is clusterable. */ if (backward_okay) { /* * Stop backward scan at beginning of object. */ if ((pindex - i) == 0) { backward_okay = FALSE; } p = vm_page_lookup(object, pindex - i); if (p) { if ((p->queue == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { backward_okay = FALSE; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && ((p->queue == PQ_INACTIVE) || (sync == VM_PAGEOUT_FORCE)) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[vm_pageout_page_count - i] = p; pageout_count++; page_base--; if (pageout_count == vm_pageout_page_count) break; } else { backward_okay = FALSE; } } else { backward_okay = FALSE; } } } + /* + * we allow reads during pageouts... 
+ */ for (i = page_base; i < (page_base + pageout_count); i++) { mc[i]->flags |= PG_BUSY; - vm_page_protect(mc[i], VM_PROT_NONE); + vm_page_protect(mc[i], VM_PROT_READ); } return vm_pageout_flush(&mc[page_base], pageout_count, sync); } int vm_pageout_flush(mc, count, sync) vm_page_t *mc; int count; int sync; { register vm_object_t object; int pageout_status[count]; int anyok = 0; int i; object = mc[0]->object; object->paging_in_progress += count; vm_pager_put_pages(object, mc, count, ((sync || (object == kernel_object)) ? TRUE : FALSE), pageout_status); for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; switch (pageout_status[i]) { case VM_PAGER_OK: ++anyok; break; case VM_PAGER_PEND: ++anyok; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ - pmap_tc_modified(mt); + pmap_clear_modify(VM_PAGE_TO_PHYS(mt)); mt->dirty = 0; break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ if (mt->queue == PQ_INACTIVE) vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); PAGE_WAKEUP(mt); } } return anyok; } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * backing_objects. * * The object and map must be locked. 
*/ static void vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only) vm_map_t map; vm_object_t object; vm_pindex_t desired; int map_remove_only; { register vm_page_t p, next; int rcount; int remove_mode; int s; if (object->type == OBJT_DEVICE) return; while (object) { if (vm_map_pmap(map)->pm_stats.resident_count <= desired) return; if (object->paging_in_progress) return; remove_mode = map_remove_only; if (object->shadow_count > 1) remove_mode = 1; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = TAILQ_FIRST(&object->memq); while (p && (rcount-- > 0)) { int refcount; if (vm_map_pmap(map)->pm_stats.resident_count <= desired) return; next = TAILQ_NEXT(p, listq); cnt.v_pdpages++; if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || (p->flags & PG_BUSY) || !pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { p = next; continue; } - refcount = pmap_tc_referenced(VM_PAGE_TO_PHYS(p)); + refcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(p)); if (refcount) { p->flags |= PG_REFERENCED; } else if (p->flags & PG_REFERENCED) { refcount = 1; } if ((p->queue != PQ_ACTIVE) && (p->flags & PG_REFERENCED)) { vm_page_activate(p); p->act_count += refcount; p->flags &= ~PG_REFERENCED; } else if (p->queue == PQ_ACTIVE) { if ((p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) { vm_page_protect(p, VM_PROT_NONE); vm_page_deactivate(p); } else { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); splx(s); } } else { p->flags &= ~PG_REFERENCED; if (p->act_count < (ACT_MAX - ACT_ADVANCE)) p->act_count += ACT_ADVANCE; s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); splx(s); } } else if (p->queue == PQ_INACTIVE) { vm_page_protect(p, VM_PROT_NONE); } p = next; } object = object->backing_object; } return; 
} /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; vm_pindex_t desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; vm_map_reference(map); if (!lock_try_write(&map->lock)) { vm_map_deallocate(map); return; } bigobj = NULL; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->is_sub_map == 0) && (tmpe->is_a_map == 0)) { obj = tmpe->object.vm_object; if ((obj != NULL) && (obj->shadow_count <= 1) && ((bigobj == NULL) || (bigobj->resident_page_count < obj->resident_page_count))) { bigobj = obj; } } tmpe = tmpe->next; } if (bigobj) vm_pageout_object_deactivate_pages(map, bigobj, desired, 0); /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (vm_map_pmap(map)->pm_stats.resident_count <= desired) break; if ((tmpe->is_sub_map == 0) && (tmpe->is_a_map == 0)) { obj = tmpe->object.vm_object; if (obj) vm_pageout_object_deactivate_pages(map, obj, desired, 0); } tmpe = tmpe->next; }; /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0) pmap_remove(vm_map_pmap(map), VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); vm_map_unlock(map); vm_map_deallocate(map); return; } #endif /* * vm_pageout_scan does the dirty work for the pageout daemon. */ static int vm_pageout_scan() { vm_page_t m, next; int page_shortage, addl_page_shortage, maxscan, maxlaunder, pcount; int pages_freed; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; int force_wakeup = 0; int vnodes_skipped = 0; int s; /* * Start scanning the inactive queue for pages we can free. We keep * scanning until we have enough free pages or we have scanned through * the entire queue. 
If we encounter dirty pages, we start cleaning * them. */ pages_freed = 0; addl_page_shortage = 0; maxlaunder = (cnt.v_inactive_target > MAXLAUNDER) ? MAXLAUNDER : cnt.v_inactive_target; - +rescan0: maxscan = cnt.v_inactive_count; for( m = TAILQ_FIRST(&vm_page_queue_inactive); (m != NULL) && (maxscan-- > 0) && ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_cache_min + cnt.v_free_target)); m = next) { cnt.v_pdpages++; if (m->queue != PQ_INACTIVE) { - break; + goto rescan0; } next = TAILQ_NEXT(m, pageq); if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); addl_page_shortage++; continue; } /* * Dont mess with busy pages, keep in the front of the * queue, most likely are being paged out. */ if (m->busy || (m->flags & PG_BUSY)) { addl_page_shortage++; continue; } - if (m->valid != 0) { - if (m->object->ref_count == 0) { - m->flags &= ~PG_REFERENCED; - pmap_tc_referenced(VM_PAGE_TO_PHYS(m)); - } else if (((m->flags & PG_REFERENCED) == 0) && - pmap_tc_referenced(VM_PAGE_TO_PHYS(m))) { - vm_page_activate(m); - continue; - } + if (m->object->ref_count == 0) { + m->flags &= ~PG_REFERENCED; + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + } else if (((m->flags & PG_REFERENCED) == 0) && + pmap_ts_referenced(VM_PAGE_TO_PHYS(m))) { + vm_page_activate(m); + continue; + } - if ((m->flags & PG_REFERENCED) != 0) { - m->flags &= ~PG_REFERENCED; - pmap_tc_referenced(VM_PAGE_TO_PHYS(m)); - vm_page_activate(m); - continue; - } - if (m->dirty == 0) { - vm_page_test_dirty(m); - } else if (m->dirty != 0) { - m->dirty = VM_PAGE_BITS_ALL; - } - } + if ((m->flags & PG_REFERENCED) != 0) { + m->flags &= ~PG_REFERENCED; + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + vm_page_activate(m); + continue; + } + if (m->dirty == 0) { + vm_page_test_dirty(m); + } else if (m->dirty != 0) { + m->dirty = VM_PAGE_BITS_ALL; + } + if (m->valid == 0) { vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); - ++cnt.v_dfree; + 
cnt.v_dfree++; ++pages_freed; } else if (m->dirty == 0) { vm_page_cache(m); ++pages_freed; } else if (maxlaunder > 0) { int written; struct vnode *vp = NULL; object = m->object; if (object->flags & OBJ_DEAD) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); continue; } if (object->type == OBJT_VNODE) { vp = object->handle; if (VOP_ISLOCKED(vp) || vget(vp, 1)) { if ((m->queue == PQ_INACTIVE) && (m->hold_count == 0) && (m->busy == 0) && (m->flags & PG_BUSY) == 0) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); } if (object->flags & OBJ_MIGHTBEDIRTY) ++vnodes_skipped; continue; } /* * The page might have been moved to another queue * during potential blocking in vget() above. */ if (m->queue != PQ_INACTIVE) { if (object->flags & OBJ_MIGHTBEDIRTY) ++vnodes_skipped; vput(vp); continue; } /* * The page may have been busied during the blocking in * vput(); We don't move the page back onto the end of * the queue so that statistics are more correct if we don't. */ if (m->busy || (m->flags & PG_BUSY)) { vput(vp); continue; } /* * If the page has become held, then skip it */ if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); if (object->flags & OBJ_MIGHTBEDIRTY) ++vnodes_skipped; vput(vp); continue; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ written = vm_pageout_clean(m, 0); if (vp) vput(vp); maxlaunder -= written; } } /* * Compute the page shortage. If we are still very low on memory be * sure that we will move a minimal amount of pages from active to * inactive. 
*/ page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); if (page_shortage <= 0) { if (pages_freed == 0) { page_shortage = cnt.v_free_min - cnt.v_free_count; } else { page_shortage = 1; } } if (addl_page_shortage) { if (page_shortage < 0) page_shortage = 0; page_shortage += addl_page_shortage; } pcount = cnt.v_active_count; m = TAILQ_FIRST(&vm_page_queue_active); while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { int refcount; if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); m = next; continue; } /* * The count for pagedaemon pages is done after checking the * page for eligbility... */ cnt.v_pdpages++; refcount = 0; if (m->object->ref_count != 0) { if (m->flags & PG_REFERENCED) { refcount += 1; } - refcount += pmap_tc_referenced(VM_PAGE_TO_PHYS(m)); + refcount += pmap_ts_referenced(VM_PAGE_TO_PHYS(m)); if (refcount) { m->act_count += ACT_ADVANCE + refcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } } m->flags &= ~PG_REFERENCED; if (refcount && (m->object->ref_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm_lru || (m->object->ref_count == 0) || (m->act_count == 0)) { --page_shortage; vm_page_protect(m, VM_PROT_NONE); if ((m->dirty == 0) && (m->object->ref_count == 0)) { vm_page_cache(m); } else { vm_page_deactivate(m); } } else { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } } m = next; } s = splvm(); /* * We try to maintain some *really* free pages, this allows interrupt * code to 
be guaranteed space. */ while (cnt.v_free_count < cnt.v_free_reserved) { m = TAILQ_FIRST(&vm_page_queue_cache); if (!m) break; vm_page_free(m); cnt.v_dfree++; } splx(s); /* * If we didn't get enough free pages, and we have skipped a vnode * in a writeable object, wakeup the sync daemon. And kick swapout * if we did not get enough free pages. */ if ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_free_target + cnt.v_cache_min) ) { if (vnodes_skipped && (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) { if (!vfs_update_wakeup) { vfs_update_wakeup = 1; wakeup(&vfs_update_wakeup); } } #if !defined(NO_SWAPPING) if (vm_swapping_enabled && (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_target)) { vm_req_vmdaemon(); vm_pageout_req_swapout = 1; } #endif } /* * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. */ if ((vm_swap_size == 0 || swap_pager_full) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) { bigproc = NULL; bigsize = 0; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { /* * if this is a system process, skip it */ if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) || ((p->p_pid < 48) && (vm_swap_size != 0))) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get the process size */ size = p->p_vmspace->vm_pmap.pm_stats.resident_count; /* * if the this process is bigger than the biggest one * remember it. 
*/ if (size > bigsize) { bigproc = p; bigsize = size; } } if (bigproc != NULL) { killproc(bigproc, "out of swap space"); bigproc->p_estcpu = 0; bigproc->p_nice = PRIO_MIN; resetpriority(bigproc); wakeup(&cnt.v_free_count); } } return force_wakeup; } static int vm_pageout_free_page_calc(count) vm_size_t count; { if (count < cnt.v_page_count) return 0; /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. */ if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + cnt.v_interrupt_free_min; cnt.v_free_reserved = vm_pageout_page_count + cnt.v_pageout_free_min + (count / 768); cnt.v_free_min += cnt.v_free_reserved; return 1; } #ifdef unused int vm_pageout_free_pages(object, add) vm_object_t object; int add; { return vm_pageout_free_page_calc(object->size); } #endif /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { (void) spl0(); /* * Initialize some paging parameters. */ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count < 2000) vm_pageout_page_count = 8; vm_pageout_free_page_calc(cnt.v_page_count); /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. */ cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved; if (cnt.v_free_count > 1024) { cnt.v_cache_max = (cnt.v_free_count - 1024) / 2; cnt.v_cache_min = (cnt.v_free_count - 1024) / 8; cnt.v_inactive_target = 2*cnt.v_cache_min + 192; } else { cnt.v_cache_min = 0; cnt.v_cache_max = 0; cnt.v_inactive_target = cnt.v_free_count / 4; } /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; swap_pager_swap_init(); /* * The pageout daemon is never done, so loop forever. 
*/ while (TRUE) { int inactive_target; int s = splvm(); if (!vm_pages_needed || ((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_min)) { vm_pages_needed = 0; tsleep(&vm_pages_needed, PVM, "psleep", 0); } else if (!vm_pages_needed) { tsleep(&vm_pages_needed, PVM, "psleep", hz/10); } inactive_target = (cnt.v_page_count - cnt.v_wire_count) / 4; if (inactive_target < 2*cnt.v_free_min) inactive_target = 2*cnt.v_free_min; cnt.v_inactive_target = inactive_target; if (vm_pages_needed) cnt.v_pdwakeups++; vm_pages_needed = 0; splx(s); vm_pager_sync(); vm_pageout_scan(); vm_pager_sync(); wakeup(&cnt.v_free_count); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } static void vm_daemon() { vm_object_t object; struct proc *p; (void) spl0(); while (TRUE) { tsleep(&vm_daemon_needed, PUSER, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { quad_t limit; vm_offset_t size; /* * if this is a system process or if we have already * looked at this process, skip it. */ if (p->p_flag & (P_SYSTEM | P_WEXIT)) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get a limit */ limit = qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, p->p_rlimit[RLIMIT_RSS].rlim_max); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) 
*/ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ size = p->p_vmspace->vm_pmap.pm_stats.resident_count * PAGE_SIZE; if (limit >= 0 && size >= limit) { vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, (vm_pindex_t)(limit >> PAGE_SHIFT) ); } } /* * we remove cached objects that have no RSS... */ restart: object = TAILQ_FIRST(&vm_object_cached_list); while (object) { /* * if there are no resident pages -- get rid of the object */ if (object->resident_page_count == 0) { vm_object_reference(object); pager_cache(object, FALSE); goto restart; } object = TAILQ_NEXT(object, cached_list); } } } #endif Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 17333) +++ head/sys/vm/vnode_pager.c (revision 17334) @@ -1,964 +1,964 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 - * $Id: vnode_pager.c,v 1.60 1996/05/03 21:01:54 phk Exp $ + * $Id: vnode_pager.c,v 1.61 1996/07/27 03:24:10 dyson Exp $ */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. 
 */

/*
 * NOTE(review): the header names of these #include directives were lost
 * when this text was extracted (angle-bracketed names stripped); the
 * bare tokens are preserved as found.  TODO: restore from the original
 * revision of sys/vm/vnode_pager.c.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static vm_offset_t vnode_pager_addr __P((struct vnode *vp, vm_ooffset_t address,
					 int *run));
static void vnode_pager_iodone __P((struct buf *bp));
static int vnode_pager_input_smlfs __P((vm_object_t object, vm_page_t m));
static int vnode_pager_input_old __P((vm_object_t object, vm_page_t m));
static void vnode_pager_dealloc __P((vm_object_t));
static int vnode_pager_getpages __P((vm_object_t, vm_page_t *, int, int));
static int vnode_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *));
static boolean_t vnode_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *));

/* pager operations vector for vnode-backed objects */
struct pagerops vnodepagerops = {
	NULL,
	vnode_pager_alloc,
	vnode_pager_dealloc,
	vnode_pager_getpages,
	vnode_pager_putpages,
	vnode_pager_haspage,
	NULL
};

static int vnode_pager_leaf_getpages __P((vm_object_t object, vm_page_t *m,
					  int count, int reqpage));
static int vnode_pager_leaf_putpages __P((vm_object_t object, vm_page_t *m,
					  int count, boolean_t sync,
					  int *rtvals));

/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 */
vm_object_t
vnode_pager_alloc(handle, size, prot, offset)
	void *handle;
	vm_size_t size;
	vm_prot_t prot;
	vm_ooffset_t offset;
{
	vm_object_t object;
	struct vnode *vp;

	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

	vp = (struct vnode *) handle;

	/*
	 * Prevent race condition when allocating the object. This
	 * can happen with NFS vnodes since the nfsnode isn't locked.
	 */
	while (vp->v_flag & VOLOCK) {
		vp->v_flag |= VOWANT;
		tsleep(vp, PVM, "vnpobj", 0);
	}
	vp->v_flag |= VOLOCK;

	/*
	 * If the object is being terminated, wait for it to
	 * go away.
	 */
	while (((object = vp->v_object) != NULL) && (object->flags & OBJ_DEAD)) {
		tsleep(object, PVM, "vadead", 0);
	}

	if (object == NULL) {
		/*
		 * And an object of the appropriate size
		 */
		object = vm_object_allocate(OBJT_VNODE, size);
		/* only regular files may persist in the object cache */
		if (vp->v_type == VREG)
			object->flags = OBJ_CANPERSIST;
		else
			object->flags = 0;

		/*
		 * Hold a reference to the vnode and initialize object data.
		 */
		VREF(vp);
		object->un_pager.vnp.vnp_size = (vm_ooffset_t) size * PAGE_SIZE;

		object->handle = handle;
		vp->v_object = object;
	} else {
		/*
		 * vm_object_reference() will remove the object from the cache if
		 * found and gain a reference to the object.
		 */
		vm_object_reference(object);
	}
	if (vp->v_type == VREG)
		vp->v_flag |= VVMIO;
	/* release the allocation interlock and wake any waiters */
	vp->v_flag &= ~VOLOCK;
	if (vp->v_flag & VOWANT) {
		vp->v_flag &= ~VOWANT;
		wakeup(vp);
	}
	return (object);
}

/*
 * Tear down the pager association: wait for pending paging I/O to
 * drain, then detach the object from its vnode and drop the vnode ref.
 */
static void
vnode_pager_dealloc(object)
	vm_object_t object;
{
	register struct vnode *vp = object->handle;

	if (vp == NULL)
		panic("vnode_pager_dealloc: pager already dealloced");

	if (object->paging_in_progress) {
		int s = splbio();
		while (object->paging_in_progress) {
			object->flags |= OBJ_PIPWNT;
			tsleep(object, PVM, "vnpdea", 0);
		}
		splx(s);
	}

	object->handle = NULL;

	vp->v_object = NULL;
	vp->v_flag &= ~(VTEXT | VVMIO);
	/* mark the vnode for early reuse */
	vp->v_flag |= VAGE;
	vrele(vp);
}

/*
 * Query whether the backing store has the page at pindex, and how many
 * contiguous pages exist before/after it (reported via *before/*after).
 */
static boolean_t
vnode_pager_haspage(object, pindex, before, after)
	vm_object_t object;
	vm_pindex_t pindex;
	int *before;
	int *after;
{
	struct vnode *vp = object->handle;
	daddr_t bn;
	int err;
	daddr_t reqblock;
	int poff;
	int bsize;
	int pagesperblock, blocksperpage;

	/*
	 * If filesystem no longer mounted or offset beyond end of file we do
	 * not have the page.
 */
	if ((vp->v_mount == NULL) ||
	    (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size))
		return FALSE;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	/* convert the page index into a filesystem block number for bmap */
	pagesperblock = bsize / PAGE_SIZE;
	blocksperpage = 0;
	if (pagesperblock > 0) {
		reqblock = pindex / pagesperblock;
	} else {
		blocksperpage = (PAGE_SIZE / bsize);
		reqblock = pindex * blocksperpage;
	}
	err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn,
		after, before);
	if (err)
		return TRUE;
	if ( bn == -1)
		return FALSE;
	if (pagesperblock > 0) {
		/* scale block-granular before/after runs to page granularity */
		poff = pindex - (reqblock * pagesperblock);
		if (before) {
			*before *= pagesperblock;
			*before += poff;
		}
		if (after) {
			int numafter;
			*after *= pagesperblock;
			numafter = pagesperblock - (poff + 1);
			/* clamp the after-run so it does not extend past EOF */
			if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) {
				numafter = OFF_TO_IDX((object->un_pager.vnp.vnp_size - IDX_TO_OFF(pindex)));
			}
			*after += numafter;
		}
	} else {
		if (before) {
			*before /= blocksperpage;
		}
		if (after) {
			*after /= blocksperpage;
		}
	}
	return TRUE;
}

/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(vp, nsize)
	struct vnode *vp;
	vm_ooffset_t nsize;
{
	vm_object_t object = vp->v_object;

	if (object == NULL)
		return;

	/*
	 * Hasn't changed size
	 */
	if (nsize == object->un_pager.vnp.vnp_size)
		return;

	/*
	 * File has shrunk. Toss any cached pages beyond the new EOF.
	 */
	if (nsize < object->un_pager.vnp.vnp_size) {
		vm_ooffset_t nsizerounded;
		nsizerounded = IDX_TO_OFF(OFF_TO_IDX(nsize + PAGE_MASK));
		if (nsizerounded < object->un_pager.vnp.vnp_size) {
			vm_object_page_remove(object,
				OFF_TO_IDX(nsize + PAGE_MASK),
				OFF_TO_IDX(object->un_pager.vnp.vnp_size),
				FALSE);
		}
		/*
		 * this gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode...
		 */
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			m = vm_page_lookup(object, OFF_TO_IDX(nsize));
			if (m) {
				kva = vm_pager_map_page(m);
				/* zero from new EOF to the end of the page */
				bzero((caddr_t) kva + (nsize & PAGE_MASK),
				    (int) (round_page(nsize) - nsize));
				vm_pager_unmap_page(kva);
			}
		}
	}
	object->un_pager.vnp.vnp_size = nsize;
	object->size = OFF_TO_IDX(nsize + PAGE_MASK);
}

/*
 * Uncache the objects of every vnode on a mount point, e.g. at unmount.
 */
void
vnode_pager_umount(mp)
	register struct mount *mp;
{
	struct vnode *vp, *nvp;

loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
		/*
		 * Vnode can be reclaimed by getnewvnode() while we
		 * traverse the list.
		 */
		if (vp->v_mount != mp)
			goto loop;

		/*
		 * Save the next pointer now since uncaching may terminate the
		 * object and render vnode invalid
		 */
		nvp = vp->v_mntvnodes.le_next;
		if (vp->v_object != NULL) {
			VOP_LOCK(vp);
			vnode_pager_uncache(vp);
			VOP_UNLOCK(vp);
		}
	}
}

/*
 * Remove vnode associated object from the object cache.
 * This routine must be called with the vnode locked.
 *
 * XXX unlock the vnode.
 * We must do this since uncaching the object may result in its
 * destruction which may initiate paging activity which may necessitate
 * re-locking the vnode.
*/ void vnode_pager_uncache(vp) struct vnode *vp; { vm_object_t object; /* * Not a mapped vnode */ object = vp->v_object; if (object == NULL) return; vm_object_reference(object); VOP_UNLOCK(vp); pager_cache(object, FALSE); VOP_LOCK(vp); return; } void vnode_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static vm_offset_t vnode_pager_addr(vp, address, run) struct vnode *vp; vm_ooffset_t address; int *run; { int rtaddress; int bsize; daddr_t block; struct vnode *rtvp; int err; daddr_t vblock; int voffset; if ((int) address < 0) return -1; if (vp->v_mount == NULL) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL); if (err || (block == -1)) rtaddress = -1; else { rtaddress = block + voffset / DEV_BSIZE; if( run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return rtaddress; } /* * interrupt routine for I/O completion */ static void vnode_pager_iodone(bp) struct buf *bp; { bp->b_flags |= B_DONE; wakeup(bp); } /* * small block file system vnode pager input */ static int vnode_pager_input_smlfs(object, m) vm_object_t object; vm_page_t m; { int i; int s; struct vnode *dp, *vp; struct buf *bp; vm_offset_t kva; int fileaddr; vm_offset_t bsize; int error = 0; vp = object->handle; if (vp->v_mount == NULL) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &dp, 0, NULL, NULL); kva = vm_pager_map_page(m); for (i = 0; i < PAGE_SIZE / bsize; i++) { if ((vm_page_bits(IDX_TO_OFF(m->pindex) + i * bsize, bsize) & m->valid)) continue; fileaddr = vnode_pager_addr(vp, IDX_TO_OFF(m->pindex) + i * bsize, (int *)0); if (fileaddr != -1) { bp = getpbuf(); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; 
if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva + i * bsize; bp->b_blkno = fileaddr; pbgetvp(dp, bp); bp->b_bcount = bsize; bp->b_bufsize = bsize; /* do the input */ VOP_STRATEGY(bp); /* we definitely need to be at splbio here */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "vnsrd", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); if (error) break; vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize); } else { vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize); bzero((caddr_t) kva + i * bsize, bsize); } } vm_pager_unmap_page(kva); - pmap_tc_modified(m); + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->flags &= ~PG_ZERO; if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager output routine */ static int vnode_pager_input_old(object, m) vm_object_t object; vm_page_t m; { struct uio auio; struct iovec aiov; int error; int size; vm_offset_t kva; error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. 
*/ kva = vm_pager_map_page(m); aiov.iov_base = (caddr_t) kva; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_procp = (struct proc *) 0; error = VOP_READ(object->handle, &auio, 0, curproc->p_ucred); if (!error) { register int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t) kva + count, PAGE_SIZE - count); } vm_pager_unmap_page(kva); } - pmap_tc_modified(m); + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->dirty = 0; m->flags &= ~PG_ZERO; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ static int vnode_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { int rtval; struct vnode *vp; vp = object->handle; rtval = VOP_GETPAGES(vp, m, count*PAGE_SIZE, reqpage, 0); if (rtval == EOPNOTSUPP) return vnode_pager_leaf_getpages(object, m, count, reqpage); else return rtval; } static int vnode_pager_leaf_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { vm_offset_t kva; off_t foff; int i, size, bsize, first, firstaddr; struct vnode *dp, *vp; int runpg; int runend; struct buf *bp; int s; int error = 0; vp = object->handle; if (vp->v_mount == NULL) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; /* get the UNDERLYING device for the file with VOP_BMAP() */ /* * originally, we did not check for an error return value -- assuming * an fs always has a bmap entry point -- that assumption is wrong!!! 
*/ foff = IDX_TO_OFF(m[reqpage]->pindex); /* * if we can't bmap, use old VOP code */ if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } cnt.v_vnodein++; cnt.v_vnodepgsin++; return vnode_pager_input_old(object, m[reqpage]); /* * if the blocksize is smaller than a page size, then use * special small filesystem code. NFS sometimes has a small * blocksize, but it can handle large reads itself. */ } else if ((PAGE_SIZE / bsize) > 1 && (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } cnt.v_vnodein++; cnt.v_vnodepgsin++; return vnode_pager_input_smlfs(object, m[reqpage]); } /* * if ANY DEV_BSIZE blocks are valid on a large filesystem block * then, the entire page is valid -- */ if (m[reqpage]->valid) { m[reqpage]->valid = VM_PAGE_BITS_ALL; for (i = 0; i < count; i++) { if (i != reqpage) vnode_pager_freepage(m[i]); } return VM_PAGER_OK; } /* * here on direct device I/O */ firstaddr = -1; /* * calculate the run that includes the required page */ for(first = 0, i = 0; i < count; i = runend) { firstaddr = vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &runpg); if (firstaddr == -1) { if (i == reqpage && foff < object->un_pager.vnp.vnp_size) { panic("vnode_pager_putpages: unexpected missing page: firstaddr: %d, foff: %ld, vnp_size: %d", firstaddr, foff, object->un_pager.vnp.vnp_size); } vnode_pager_freepage(m[i]); runend = i + 1; first = runend; continue; } runend = i + runpg; if (runend <= reqpage) { int j; for (j = i; j < runend; j++) { vnode_pager_freepage(m[j]); } } else { if (runpg < (count - first)) { for (i = first + runpg; i < count; i++) vnode_pager_freepage(m[i]); count = first + runpg; } break; } first = runend; } /* * the first and last page have been calculated now, move input pages * to be zero based... 
*/ if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; } count -= first; reqpage -= first; } /* * calculate the file virtual address for the transfer */ foff = IDX_TO_OFF(m[0]->pindex); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; if ((foff + size) > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - foff; /* * round up physical size for real devices */ if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); bp = getpbuf(); kva = (vm_offset_t) bp->b_data; /* * and map the pages to be read into the kva */ pmap_qenter(kva, m, count); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; /* B_PHYS is not set, but it is nice to fill this in */ bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_blkno = firstaddr; pbgetvp(dp, bp); bp->b_bcount = size; bp->b_bufsize = size; cnt.v_vnodein++; cnt.v_vnodepgsin += count; /* do the input */ VOP_STRATEGY(bp); s = splbio(); /* we definitely need to be at splbio here */ while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "vnread", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; if (!error) { if (size != count * PAGE_SIZE) bzero((caddr_t) kva + size, PAGE_SIZE * count - size); } pmap_qremove(kva, count); /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); for (i = 0; i < count; i++) { - pmap_tc_modified(m[i]); + pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; m[i]->valid = VM_PAGE_BITS_ALL; m[i]->flags &= ~PG_ZERO; if (i != reqpage) { /* * whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere. (it already is in the object). Result: * It appears that emperical results show that * deactivating pages is best. 
*/ /* * just in case someone was asking for this page we * now tell them that it is ok to use */ if (!error) { vm_page_deactivate(m[i]); PAGE_WAKEUP(m[i]); } else { vnode_pager_freepage(m[i]); } } } if (error) { printf("vnode_pager_getpages: I/O read error\n"); } return (error ? VM_PAGER_ERROR : VM_PAGER_OK); } static int vnode_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int rtval; struct vnode *vp; vp = object->handle; rtval = VOP_PUTPAGES(vp, m, count*PAGE_SIZE, sync, rtvals, 0); if (rtval == EOPNOTSUPP) return vnode_pager_leaf_putpages(object, m, count, sync, rtvals); else return rtval; } /* * generic vnode pager output routine */ static int vnode_pager_leaf_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int i; struct vnode *vp; int maxsize, ncount; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int error; vp = object->handle;; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; if ((int) m[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! 
-- 0x%x(%x)\n", m[0]->pindex, m[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(m[0]->pindex); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) maxsize = object->un_pager.vnp.vnp_size - poffset; else maxsize = 0; ncount = btoc(maxsize); if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } #ifdef BOGUS if (ncount == 0) { printf("vnode_pager_putpages: write past end of file: %d, %lu\n", poffset, (unsigned long) object->un_pager.vnp.vnp_size); return rtvals[0]; } #endif } } for (i = 0; i < count; i++) { m[i]->busy++; m[i]->flags &= ~PG_BUSY; } aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_procp = (struct proc *) 0; error = VOP_WRITE(vp, &auio, IO_VMIO|(sync?IO_SYNC:0), curproc->p_ucred); cnt.v_vnodeout++; cnt.v_vnodepgsout += ncount; if (error) { printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { printf("vnode_pager_putpages: residual I/O %d at %ld\n", auio.uio_resid, m[0]->pindex); } for (i = 0; i < count; i++) { m[i]->busy--; if (i < ncount) { rtvals[i] = VM_PAGER_OK; } if ((m[i]->busy == 0) && (m[i]->flags & PG_WANTED)) wakeup(m[i]); } return rtvals[0]; } struct vnode * vnode_pager_lock(object) vm_object_t object; { for (; object != NULL; object = object->backing_object) { if (object->type != OBJT_VNODE) continue; VOP_LOCK(object->handle); return object->handle; } return NULL; }