Index: head/sys/alpha/alpha/pmap.c
===================================================================
--- head/sys/alpha/alpha/pmap.c	(revision 40699)
+++ head/sys/alpha/alpha/pmap.c	(revision 40700)
@@ -1,3267 +1,3266 @@
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 1998 Doug Rabson
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp
 *	with some ideas from NetBSD's alpha pmap
- * $Id: pmap.c,v 1.10 1998/09/04 18:49:35 dfr Exp $
+ * $Id: pmap.c,v 1.11 1998/10/21 11:38:06 dg Exp $
 */

/*
 * Manages physical address maps.
 *
 * In addition to hardware address maps, this
 * module is called upon to provide software-use-only
 * maps which may or may not be stored in the same
 * form as hardware maps.  These pseudo-maps are
 * used to store intermediate results from copy
 * operations to and from address spaces.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.
This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ /* * Notes for alpha pmap. * * On alpha, pm_pdeobj will hold lev1, lev2 and lev3 page tables. * Indices from 0 to NUSERLEV3MAPS-1 will map user lev3 page tables, * indices from NUSERLEV3MAPS to NUSERLEV3MAPS+NUSERLEV2MAPS-1 will * map user lev2 page tables and index NUSERLEV3MAPS+NUSERLEV2MAPS * will map the lev1 page table. The lev1 table will self map at * address VADDR(PTLEV1I,0,0). * * The vm_object kptobj holds the kernel page tables on i386 (62 or 63 * of them, depending on whether the system is SMP). On alpha, kptobj * will hold the lev3 and lev2 page tables for K1SEG. Indices 0 to * NKLEV3MAPS-1 will map kernel lev3 page tables and indices * NKLEV3MAPS to NKLEV3MAPS+NKLEV2MAPS will map lev2 page tables. (XXX * should the kernel Lev1map be inserted into this object?). * * pvtmmap is not needed for alpha since K0SEG maps all of physical * memory. CADDR1 and CADDR2 are not needed for the same reason. The * only places outside pmap and machdep which use CADDR1 are xxdump * routines which use them for dumping physical pages. * * * alpha virtual memory map: * * * Address Lev1 index * * --------------------------------- * 0000000000000000 | | 0 * | | * | | * | | * | | * --- --- * User space (USEG) * --- --- * | | * | | * | | * | | * 000003ffffffffff | | 511=UMAXLEV1I * --------------------------------- * fffffc0000000000 | | 512=K0SEGLEV1I * | Kernel code/data/bss | * | | * | | * | | * --- --- * K0SEG * --- --- * | | * | 1-1 physical/virtual | * | | * | | * fffffdffffffffff | | * --------------------------------- * fffffe0000000000 | | 768=K1SEGLEV1I * | Kernel dynamic data | * | | * | | * | | * --- --- * K1SEG * --- --- * | | * | mapped by ptes | * | | * | | * fffffff7ffffffff | | * --------------------------------- * fffffffe00000000 | | 1023=PTLEV1I * | PTmap (pte self map) | * ffffffffffffffff | | * --------------------------------- * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if 0 #define PMAP_DIAGNOSTIC #define PMAP_DEBUG #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #if 1 static void pmap_break(void) { } /* #define PMAP_DEBUG_VA(va) if ((va) == 0x120058000) pmap_break(); else */ #endif #ifndef PMAP_DEBUG_VA #define PMAP_DEBUG_VA(va) do {} while(0) #endif /* * Some macros for manipulating virtual addresses */ #define ALPHA_L1SIZE (1L << ALPHA_L1SHIFT) #define ALPHA_L2SIZE (1L << ALPHA_L2SHIFT) #define alpha_l1trunc(va) ((va) & ~(ALPHA_L1SIZE-1)) #define alpha_l2trunc(va) ((va) & ~(ALPHA_L2SIZE-1)) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pte_w(pte) ((*(pte) & PG_W) != 0) #define pmap_pte_managed(pte) ((*(pte) & PG_MANAGED) != 0) #define pmap_pte_v(pte) ((*(pte) & PG_V) != 0) #define pmap_pte_pa(pte) alpha_ptob(ALPHA_PTE_TO_PFN(*(pte))) #define pmap_pte_prot(pte) (*(pte) & PG_PROT) #define pmap_pte_set_w(pte, v) ((v)?(*pte |= PG_W):(*pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*pte &= ~PG_PROT), (*pte |= (v))) /* * Given a map and a machine independent protection code, * convert to an alpha protection code. */ #define pte_prot(m, p) (protection_codes[m == pmap_kernel() ? 
0 : 1][p]) int protection_codes[2][8]; #define pa_index(pa) atop((pa) - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Return non-zero if this pmap is currently active */ #define pmap_isactive(pmap) (pmap->pm_active) /* * Extract level 1, 2 and 3 page table indices from a va */ #define PTMASK ((1 << ALPHA_PTSHIFT) - 1) #define pmap_lev1_index(va) (((va) >> ALPHA_L1SHIFT) & PTMASK) #define pmap_lev2_index(va) (((va) >> ALPHA_L2SHIFT) & PTMASK) #define pmap_lev3_index(va) (((va) >> ALPHA_L3SHIFT) & PTMASK) /* * Given a physical address, construct a pte */ #define pmap_phys_to_pte(pa) ALPHA_PTE_FROM_PFN(alpha_btop(pa)) /* * Given a page frame number, construct a k0seg va */ #define pmap_k0seg_to_pfn(va) alpha_btop(ALPHA_K0SEG_TO_PHYS(va)) /* * Given a pte, construct a k0seg va */ #define pmap_k0seg_to_pte(va) ALPHA_PTE_FROM_PFN(pmap_k0seg_to_pfn(va)) /* * Lev1map: * * Kernel level 1 page table. This maps all kernel level 2 * page table pages, and is used as a template for all user * pmap level 1 page tables. When a new user level 1 page * table is allocated, all Lev1map PTEs for kernel addresses * are copied to the new map. * * Lev2map: * * Initial set of kernel level 2 page table pages. These * map the kernel level 3 page table pages. As kernel * level 3 page table pages are added, more level 2 page * table pages may be added to map them. These pages are * never freed. * * Lev3map: * * Initial set of kernel level 3 page table pages. These * map pages in K1SEG. More level 3 page table pages may * be added at run-time if additional K1SEG address space * is required. These pages are never freed. * * Lev2mapsize: * * Number of entries in the initial Lev2map. * * Lev3mapsize: * * Number of entries in the initial Lev3map. * * NOTE: When mappings are inserted into the kernel pmap, all * level 2 and level 3 page table pages must already be allocated * and mapped into the parent page table. */ pt_entry_t *Lev1map, *Lev2map, *Lev3map; vm_size_t Lev2mapsize, Lev3mapsize; /* * Statically allocated kernel pmap */ static struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ static vm_offset_t vm_first_phys; static int pv_npg; static vm_object_t kptobj; static int nklev3, nklev2; vm_offset_t kernel_vm_end; /* * Data for the ASN allocator */ static int pmap_maxasn; static int pmap_nextasn = 0; static u_int pmap_current_asngen = 1; static pmap_t pmap_active = 0; /* * Data for the pv entry allocation mechanism */ static vm_zone_t pvzone; static struct vm_zone pvzone_store; static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; static int pmap_pagedaemon_waken = 0; static struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; static pv_table_t *pv_table; caddr_t CADDR1; static caddr_t CADDR2; static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static pv_entry_t get_pv_entry __P((void)); static void alpha_protection_init __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static PMAP_INLINE int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte)); static int pmap_remove_pte __P((pmap_t pmap, pt_entry_t* ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); static vm_offset_t pmap_kmem_choose(vm_offset_t addr) ; void pmap_collect(void); /* * Routine: pmap_lev1pte * Function: * Extract the level 1 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev1pte(pmap_t pmap, vm_offset_t va) { if (!pmap) return 0; return &pmap->pm_lev1[pmap_lev1_index(va)]; } /* * Routine: pmap_lev2pte * Function: * Extract the level 2 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev2pte(pmap_t pmap, vm_offset_t va) { pt_entry_t* l1pte; pt_entry_t* l2map; l1pte = pmap_lev1pte(pmap, va); if (!pmap_pte_v(l1pte)) return 0; l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l1pte)); return &l2map[pmap_lev2_index(va)]; } /* * Routine: pmap_lev3pte * Function: * Extract the level 3 page table entry associated * with the given map/virtual_address pair. 
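 *
 *	A sketch of the three-level walk these helpers perform, pieced
 *	together from pmap_lev1pte/pmap_lev2pte above (illustrative only;
 *	it assumes the usual Alpha layout of 8K pages and 1024 PTEs per
 *	page-table page):
 *
 *		l1pte = &pmap->pm_lev1[pmap_lev1_index(va)];
 *		l2map = (pt_entry_t *) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l1pte));
 *		l2pte = &l2map[pmap_lev2_index(va)];
 *		l3map = (pt_entry_t *) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l2pte));
 *		l3pte = &l3map[pmap_lev3_index(va)];
 *
 *	Each step bails out with a NULL result if the intermediate PTE is
 *	not valid, so callers must check for a NULL return.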
*/ static PMAP_INLINE pt_entry_t* pmap_lev3pte(pmap_t pmap, vm_offset_t va) { pt_entry_t* l2pte; pt_entry_t* l3map; l2pte = pmap_lev2pte(pmap, va); if (!l2pte || !pmap_pte_v(l2pte)) return 0; l3map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l2pte)); return &l3map[pmap_lev3_index(va)]; } vm_offset_t pmap_steal_memory(vm_size_t size) { vm_size_t bank_size; vm_offset_t pa, va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i+2]; i+= 2) { phys_avail[i] = phys_avail[i+2]; phys_avail[i+1] = phys_avail[i+3]; } phys_avail[i] = 0; phys_avail[i+1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; va = ALPHA_PHYS_TO_K0SEG(pa); bzero((caddr_t) va, size); return va; } extern pt_entry_t rom_pte; /* XXX */ extern int prom_mapped; /* XXX */ /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t ptaddr, u_int maxasn) { pt_entry_t newpte; pt_entry_t* pte; vm_offset_t va; int i; /* * Setup ASNs */ pmap_nextasn = 0; pmap_maxasn = maxasn; pmap_current_asngen = 1; /* * Allocate a level 1 map for the kernel. */ Lev1map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE); /* * Allocate a level 2 map for the kernel */ Lev2map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE); Lev2mapsize = PAGE_SIZE; /* * Allocate some level 3 maps for the kernel */ Lev3map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE*NKPT); Lev3mapsize = NKPT * PAGE_SIZE; /* Map all of the level 2 maps */ for (i = 0; i < howmany(Lev2mapsize, PAGE_SIZE); i++) { unsigned long pfn = pmap_k0seg_to_pfn((vm_offset_t) Lev2map) + i; newpte = ALPHA_PTE_FROM_PFN(pfn); newpte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_W; Lev1map[K1SEGLEV1I + i] = newpte; } /* Setup the mapping for the prom console */ { if (pmap_uses_prom_console()) { /* XXX save old pte so that we can remap prom if necessary */ rom_pte = *(pt_entry_t *)ptaddr & ~PG_ASM; /* XXX */ } prom_mapped = 0; /* * Actually, this code lies. The prom is still mapped, and will * remain so until the context switch after alpha_init() returns. * Printfs using the firmware before then will end up frobbing * Lev1map unnecessarily, but that's OK. */ } /* * Level 1 self mapping. * * Don't set PG_ASM since the self-mapping is different for each * address space. */ newpte = pmap_k0seg_to_pte((vm_offset_t) Lev1map); newpte |= PG_V | PG_KRE | PG_KWE; Lev1map[PTLEV1I] = newpte; /* Map all of the level 3 maps */ for (i = 0; i < howmany(Lev3mapsize, PAGE_SIZE); i++) { unsigned long pfn = pmap_k0seg_to_pfn((vm_offset_t) Lev3map) + i; newpte = ALPHA_PTE_FROM_PFN(pfn); newpte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_W; Lev2map[i] = newpte; } avail_start = phys_avail[0]; for (i = 0; phys_avail[i+2]; i+= 2) ; avail_end = phys_avail[i+1]; virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VPTBASE; /* * Initialize protection array. */ alpha_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_lev1 = Lev1map; kernel_pmap->pm_count = 1; kernel_pmap->pm_active = 1; kernel_pmap->pm_asn = 0; kernel_pmap->pm_asngen = pmap_current_asngen; pmap_nextasn = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nklev3 = NKPT; nklev2 = 1; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. 
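 *
 *	The SYSMAP() macro below just carves "n" pages out of the kernel
 *	VA cursor "va" and hands back both the chosen address ("v") and a
 *	pointer to its level 3 PTE(s) ("p").  A sketch of the expansion of
 *	SYSMAP(caddr_t, CMAP1, CADDR1, 1):
 *
 *		CADDR1 = (caddr_t) va;	va += PAGE_SIZE;
 *		CMAP1 = pte;		pte += 1;
 *
 *	A mapping would later be installed by writing a PTE through CMAP1
 *	and touching the page through CADDR1; on the alpha this is largely
 *	unused, since K0SEG already maps all of physical memory.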
*/ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = pmap_lev3pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) virtual_avail = va; *CMAP1 = *CMAP2 = 0; /* * Set up proc0's PCB such that the ptbr points to the right place * and has the kernel pmap's. */ proc0.p_addr->u_pcb.pcb_hw.apcb_ptbr = ALPHA_K0SEG_TO_PHYS((vm_offset_t)Lev1map) >> PAGE_SHIFT; proc0.p_addr->u_pcb.pcb_hw.apcb_asn = 0; } int pmap_uses_prom_console() { #if 0 extern int cputype; #if defined(NEW_SCC_DRIVER) return (cputype == ST_DEC_21000); #else return (cputype == ST_DEC_21000 || cputype == ST_DEC_3000_300 || cputype == ST_DEC_3000_500); #endif /* NEW_SCC_DRIVER */ #endif return 1; } void pmap_setdevram(unsigned long long basea, vm_offset_t sizea) { } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t s; int i; int initial_pvs; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(pv_table_t) * pv_npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_table_t *) addr; for(i = 0; i < pv_npg; i++) { vm_offset_t pa; TAILQ_INIT(&pv_table[i].pv_list); pv_table[i].pv_list_count = 0; pa = vm_first_phys + i * PAGE_SIZE; pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa); } /* * init the pv free list */ initial_pvs = pv_npg; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg); /* * object for kernel page table pages */ kptobj = vm_object_allocate(OBJT_DEFAULT, NKLEV3MAPS + NKLEV2MAPS); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. 
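 *
 *	A minimal usage sketch ("pa_start", "pa_end" and the protection
 *	value are illustrative, not taken from a real caller):
 *
 *		va = pmap_map(va, pa_start, pa_end,
 *		    VM_PROT_READ | VM_PROT_WRITE);
 *
 *	The return value is the VA just past the newly entered range, so
 *	successive calls can pack mappings back to back.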
*/ vm_offset_t pmap_map(vm_offset_t virt, vm_offset_t start, vm_offset_t end, int prot) { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /*************************************************** * Manipulate TLBs for a pmap ***************************************************/ static void pmap_invalidate_asn(pmap_t pmap) { pmap->pm_asngen = 0; } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap_isactive(pmap)) ALPHA_TBIS(va); else pmap_invalidate_asn(pmap); } static void pmap_invalidate_all(pmap_t pmap) { if (pmap_isactive(pmap)) ALPHA_TBIA(); else pmap_invalidate_asn(pmap); } static void pmap_get_asn(pmap_t pmap) { if (pmap->pm_asngen != pmap_current_asngen) { if (pmap_nextasn > pmap_maxasn) { /* * Start a new ASN generation. * * Invalidate all per-process mappings and I-cache */ pmap_nextasn = 0; pmap_current_asngen++; if (pmap_current_asngen == 0) { /* * Clear the pm_asngen of all pmaps. * This is safe since it is only called from * pmap_activate after it has deactivated * the old pmap. */ struct proc *p; pmap_t tpmap; #ifdef PMAP_DIAGNOSTIC printf("pmap_get_asn: generation rollover\n"); #endif pmap_current_asngen = 1; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { tpmap = &p->p_vmspace->vm_pmap; tpmap->pm_asngen = 0; } } } /* * Since we are about to start re-using ASNs, we must * clear out the TLB and the I-cache since they are tagged * with the ASN. */ ALPHA_TBIAP(); alpha_pal_imb(); } pmap->pm_asn = pmap_nextasn++; pmap->pm_asngen = pmap_current_asngen; } } /*************************************************** * Low level helper routines..... ***************************************************/ /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { pt_entry_t* pte = pmap_lev3pte(pmap, va); if (pte) return alpha_ptob(ALPHA_PTE_TO_PFN(*pte)); else return 0; } /* * determine if a page is managed (memory vs. device) */ static PMAP_INLINE int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; pt_entry_t *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; pt_entry_t npte = pmap_phys_to_pte(VM_PAGE_TO_PHYS(m[i])) | PG_ASM | PG_KRE | PG_KWE | PG_V; pt_entry_t opte; pte = vtopte(tva); opte = *pte; PMAP_DEBUG_VA(va); *pte = npte; if (opte) pmap_invalidate_page(kernel_pmap, tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. 
*/ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { pte = vtopte(va); PMAP_DEBUG_VA(va); *pte = 0; pmap_invalidate_page(kernel_pmap, va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_offset_t pa) { pt_entry_t *pte; pt_entry_t npte, opte; npte = pmap_phys_to_pte(pa) | PG_ASM | PG_KRE | PG_KWE | PG_V; pte = vtopte(va); opte = *pte; PMAP_DEBUG_VA(va); *pte = npte; if (opte) pmap_invalidate_page(kernel_pmap, va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { register pt_entry_t *pte; pte = vtopte(va); PMAP_DEBUG_VA(va); *pte = 0; pmap_invalidate_page(kernel_pmap, va); } static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m && vm_page_sleep(m, "pplookp", NULL)) goto retry; return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. */ void pmap_new_proc(struct proc *p) { int i; vm_object_t upobj; vm_page_t m; struct user *up; pt_entry_t *ptek, oldpte; /* * allocate object for the upages */ if ((upobj = p->p_upages_obj) == NULL) { upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; } /* get a kernel virtual address for the UPAGES for this proc */ if ((up = p->p_addr) == NULL) { up = (struct user *) kmem_alloc_pageable(kernel_map, UPAGES * PAGE_SIZE); #if !defined(MAX_PERF) if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); #endif p->p_addr = up; } ptek = vtopte((vm_offset_t) up); for(i=0;iwire_count++; cnt.v_wire_count++; oldpte = *(ptek + i); /* * Enter the page into the kernel address space. */ *(ptek + i) = pmap_phys_to_pte(VM_PAGE_TO_PHYS(m)) | PG_ASM | PG_KRE | PG_KWE | PG_V; if (oldpte) pmap_invalidate_page(kernel_pmap, (vm_offset_t)up + i * PAGE_SIZE); vm_page_wakeup(m); vm_page_flag_clear(m, PG_ZERO); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); m->valid = VM_PAGE_BITS_ALL; } } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; pt_entry_t *ptek, oldpte; upobj = p->p_upages_obj; ptek = vtopte((vm_offset_t) p->p_addr); for(i=0;ip_addr + i * PAGE_SIZE); - vm_page_unwire(m); + vm_page_unwire(m, 0); vm_page_free(m); } } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;idirty = VM_PAGE_BITS_ALL; - vm_page_unwire(m); - vm_page_deactivate(m); + vm_page_unwire(m, 0); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. 
*/ void pmap_swapin_proc(p) struct proc *p; { int i,rv; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;ip_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(upobj, &m, 1, 0); #if !defined(MAX_PERF) if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); #endif m = vm_page_lookup(upobj, i); m->valid = VM_PAGE_BITS_ALL; } vm_page_wire(m); vm_page_wakeup(m); vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); } /* * The pcb may be at a different physical address now so cache the * new address. */ p->p_md.md_pcbpaddr = (void*) vtophys((vm_offset_t) &p->p_addr->u_pcb); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { int s; while (vm_page_sleep(m, "pmuwpt", NULL)); if (m->hold_count == 0) { vm_offset_t pteva; pt_entry_t* pte; int level; /* * unmap the page table page */ if (m->pindex >= NUSERLEV3MAPS) { /* Level 2 page table */ pte = pmap_lev1pte(pmap, va); pteva = (vm_offset_t) PTlev2 + alpha_ptob(m->pindex - NUSERLEV3MAPS); } else { /* Level 3 page table */ pte = pmap_lev2pte(pmap, va); pteva = (vm_offset_t) PTmap + alpha_ptob(m->pindex); } *pte = 0; if (m->pindex < NUSERLEV3MAPS) { /* unhold the level 2 page table */ vm_page_t lev2pg; lev2pg = pmap_page_lookup(pmap->pm_pteobj, NUSERLEV3MAPS + pmap_lev1_index(va)); vm_page_unhold(lev2pg); if (lev2pg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, lev2pg); } --pmap->pm_stats.resident_count; /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { if (m->flags & PG_WANTED) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } vm_page_flag_set(m, PG_BUSY); vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, va, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { unsigned ptepindex; if (va >= VM_MAXUSER_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> ALPHA_L2SHIFT); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } } return pmap_unwire_pte_hold(pmap, va, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_lev1 = Lev1map; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; pmap->pm_active = 0; pmap->pm_asn = 0; pmap->pm_asngen = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
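 *
 *	Rough lifecycle, pieced together from the routines in this file
 *	(the exact vmspace call sites live elsewhere and are assumed here):
 *
 *		pmap_pinit(pmap);		lev1 page + self map
 *		pmap_enter(), pmap_remove()...	normal use
 *		pmap_remove_pages(pmap, ...);	tear down user mappings
 *		pmap_release(pmap);		free the page table pages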
*/ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t lev1pg; /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUSERLEV3MAPS + NUSERLEV2MAPS + 1); /* * allocate the page directory page */ retry: lev1pg = vm_page_grab(pmap->pm_pteobj, NUSERLEV3MAPS + NUSERLEV2MAPS, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); lev1pg->wire_count = 1; ++cnt.v_wire_count; vm_page_flag_clear(lev1pg, PG_MAPPED | PG_BUSY); /* not mapped normally */ lev1pg->valid = VM_PAGE_BITS_ALL; pmap->pm_lev1 = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(lev1pg)); if ((lev1pg->flags & PG_ZERO) == 0) bzero(pmap->pm_lev1, PAGE_SIZE); /* wire in kernel global address entries */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTlev1 + K1SEGLEV1I, pmap->pm_lev1 + K1SEGLEV1I, nklev2 * PTESIZE); /* install self-referential address mapping entry (not PG_ASM) */ pmap->pm_lev1[PTLEV1I] = pmap_phys_to_pte(VM_PAGE_TO_PHYS(lev1pg)) | PG_V | PG_KRE | PG_KWE; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; pmap->pm_active = 0; pmap->pm_asn = 0; pmap->pm_asngen = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } static int pmap_release_free_page(pmap_t pmap, vm_page_t p) { int s; pt_entry_t* pte; pt_entry_t* l2map; if (p->pindex >= NUSERLEV3MAPS + NUSERLEV2MAPS) /* level 1 page table */ pte = &pmap->pm_lev1[PTLEV1I]; else if (p->pindex >= NUSERLEV3MAPS) /* level 2 page table */ pte = &pmap->pm_lev1[p->pindex - NUSERLEV3MAPS]; else { /* level 3 page table */ pte = &pmap->pm_lev1[p->pindex >> ALPHA_PTSHIFT]; l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(pte)); pte = &l2map[p->pindex & ((1 << ALPHA_PTSHIFT) - 1)]; } /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ if (vm_page_sleep(p, "pmaprl", NULL)) return 0; vm_page_flag_set(p, PG_BUSY); /* * Remove the page table page from the processes address space. */ *pte = 0; pmap->pm_stats.resident_count--; #ifdef PMAP_DEBUG if (p->hold_count) { panic("pmap_release: freeing held page table page"); } #endif /* * Level1 pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == NUSERLEV3MAPS + NUSERLEV2MAPS) bzero(pmap->pm_lev1 + K1SEGLEV1I, nklev2 * PTESIZE); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; #ifdef PMAP_DEBUG { u_long *lp = (u_long*) ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(p)); u_long *ep = (u_long*) ((char*) lp + PAGE_SIZE); for (; lp < ep; lp++) if (*lp != 0) panic("pmap_release_free_page: page not zero"); } #endif p->wire_count--; cnt.v_wire_count--; vm_page_free_zero(p); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { pt_entry_t* pte; vm_offset_t pteva, ptepa; vm_page_t m; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_ZERO | VM_ALLOC_RETRY); if (m->queue != PQ_NONE) { int s = splvm(); vm_page_unqueue(m); splx(s); } if (m->wire_count == 0) cnt.v_wire_count++; m->wire_count++; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. 
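 *
 *	The pte object index encodes which table is wanted (see the notes
 *	at the top of the file): indices below NUSERLEV3MAPS name level 3
 *	tables, the rest name level 2 tables.  A sketch of the parent
 *	entry selection done just below:
 *
 *		level 2 table:	pm_lev1[ptepindex - NUSERLEV3MAPS]
 *		level 3 table:	l1index = ptepindex >> ALPHA_PTSHIFT;
 *				l2map[ptepindex & ((1 << ALPHA_PTSHIFT) - 1)]
 *
 *	If the level 1 entry is still invalid, the level 2 page is
 *	allocated first by recursing with index NUSERLEV3MAPS + l1index.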
*/ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); if (ptepindex >= NUSERLEV3MAPS) { pte = &pmap->pm_lev1[ptepindex - NUSERLEV3MAPS]; } else { int l1index = ptepindex >> ALPHA_PTSHIFT; pt_entry_t* l1pte = &pmap->pm_lev1[l1index]; pt_entry_t* l2map; if (!pmap_pte_v(l1pte)) _pmap_allocpte(pmap, NUSERLEV3MAPS + l1index); else { int l2ptepindex = NUSERLEV3MAPS + l1index; vm_page_t l2page = pmap_page_lookup(pmap->pm_pteobj, NUSERLEV3MAPS + l1index); l2page->hold_count++; } l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l1pte)); pte = &l2map[ptepindex & ((1 << ALPHA_PTSHIFT) - 1)]; } *pte = pmap_phys_to_pte(ptepa) | PG_KRE | PG_KWE | PG_V; /* * Set the page table hint */ pmap->pm_ptphint = m; if ((m->flags & PG_ZERO) == 0) bzero((caddr_t) ALPHA_PHYS_TO_K0SEG(ptepa), PAGE_SIZE); m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(m, PG_ZERO | PG_BUSY); vm_page_flag_set(m, PG_MAPPED); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { unsigned ptepindex; pt_entry_t* lev2pte; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> (PAGE_SHIFT + ALPHA_PTSHIFT); /* * Get the level2 entry */ lev2pte = pmap_lev2pte(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (lev2pte && pmap_pte_v(lev2pte)) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t p,n,lev1pg; vm_object_t object = pmap->pm_pteobj; int curgeneration; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif lev1pg = NULL; retry: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex >= NUSERLEV3MAPS) { continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex < NUSERLEV3MAPS) { /* can this happen? 
maybe panic */ goto retry; } if (p->pindex >= NUSERLEV3MAPS + NUSERLEV2MAPS) { lev1pg = p; continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } if (lev1pg && !pmap_release_free_page(pmap, lev1pg)) goto retry; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { /* XXX come back to this */ struct proc *p; struct pmap *pmap; int s; pt_entry_t* pte; pt_entry_t newlev1, newlev2; vm_offset_t pa; vm_page_t nkpg; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = VM_MIN_KERNEL_ADDRESS;; /* Count the level 2 page tables */ nklev2 = 0; nklev3 = 0; while (pmap_pte_v(pmap_lev1pte(kernel_pmap, kernel_vm_end))) { nklev2++; nklev3 += (1L << ALPHA_PTSHIFT); kernel_vm_end += ALPHA_L1SIZE; } /* Count the level 3 page tables in the last level 2 page table */ kernel_vm_end -= ALPHA_L1SIZE; nklev3 -= (1 << ALPHA_PTSHIFT); while (pmap_pte_v(pmap_lev2pte(kernel_pmap, kernel_vm_end))) { nklev3++; kernel_vm_end += ALPHA_L2SIZE; } } addr = (addr + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); while (kernel_vm_end < addr) { /* * If the level 1 pte is invalid, allocate a new level 2 page table */ pte = pmap_lev1pte(kernel_pmap, kernel_vm_end); if (!pmap_pte_v(pte)) { int pindex = NKLEV3MAPS + pmap_lev1_index(kernel_vm_end) - K1SEGLEV1I; nkpg = vm_page_alloc(kptobj, pindex, VM_ALLOC_SYSTEM); #if !defined(MAX_PERF) if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); #endif printf("pmap_growkernel: growing to %lx\n", addr); printf("pmap_growkernel: adding new level2 page table\n"); nklev2++; vm_page_wire(nkpg); pa = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(pa); newlev1 = pmap_phys_to_pte(pa) | PG_V | PG_ASM | PG_KRE | PG_KWE; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_lev1pte(pmap, kernel_vm_end) = newlev1; } } *pte = newlev1; pmap_invalidate_all(kernel_pmap); } /* * If the level 2 pte is invalid, allocate a new level 3 page table */ pte = pmap_lev2pte(kernel_pmap, kernel_vm_end); if (pmap_pte_v(pte)) { kernel_vm_end = (kernel_vm_end + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kptobj, nklev3, VM_ALLOC_SYSTEM); #if !defined(MAX_PERF) if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); #endif nklev3++; vm_page_wire(nkpg); pa = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(pa); newlev2 = pmap_phys_to_pte(pa) | PG_V | PG_ASM | PG_KRE | PG_KWE; *pte = newlev2; kernel_vm_end = (kernel_vm_end + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap_t pmap) { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); #if !defined(MAX_PERF) panic("destroying a pmap is not yet implemented"); #endif } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap_t pmap) { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; zfreei(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. 
* the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloci(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { pv_table_t *ppv; int i; vm_offset_t pa; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < pv_npg; i++) { if ((ppv = &pv_table[i]) == 0) continue; m = ppv->pv_vm_page; if ((pa = VM_PAGE_TO_PHYS(m)) == 0) continue; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(pa); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap_t pmap, pv_table_t* ppv, vm_offset_t va) { pv_entry_t pv; int rtval; int s; s = splvm(); if (ppv->pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; if (TAILQ_FIRST(&ppv->pv_list) == NULL) vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa) { int s; pv_entry_t pv; pv_table_t *ppv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); ppv = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); ppv->pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t* ptq, vm_offset_t va) { pt_entry_t oldpte; pv_table_t *ppv; oldpte = *ptq; PMAP_DEBUG_VA(va); *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { ppv = pa_to_pvh(pmap_pte_pa(&oldpte)); return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { register pt_entry_t *ptq; ptq = pmap_lev3pte(pmap, va); /* * if there is no pte for this address, just skip it!!! */ if (!ptq || !pmap_pte_v(ptq)) return; /* * get a local va for mappings for this pmap. */ (void) pmap_remove_pte(pmap, ptq, va); pmap_invalidate_page(pmap, va); return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. 
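 *
 *	The common single-page case is short-circuited below; for example
 *	(addresses illustrative):
 *
 *		pmap_remove(pmap, va, va + PAGE_SIZE);
 *
 *	goes straight to pmap_remove_page() instead of the generic
 *	level 1/2/3 scan.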
*/ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va, nva; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pmap_remove_page(pmap, sva); return; } for (va = sva; va < eva; va = nva) { if (!pmap_pte_v(pmap_lev1pte(pmap, va))) { nva = alpha_l1trunc(va + ALPHA_L1SIZE); continue; } if (!pmap_pte_v(pmap_lev2pte(pmap, va))) { nva = alpha_l2trunc(va + ALPHA_L2SIZE); continue; } pmap_remove_page(pmap, va); nva = va + PAGE_SIZE; } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(vm_offset_t pa) { register pv_entry_t pv; pv_table_t *ppv; pt_entry_t *pte, tpte; int nmodify; int s; nmodify = 0; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", pa); } #endif s = splvm(); ppv = pa_to_pvh(pa); while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) { pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; if (pmap_pte_pa(pte) != pa) panic("pmap_remove_all: pv_table for %x is inconsistent", pa); tpte = *pte; PMAP_DEBUG_VA(pv->pv_va); *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pt_entry_t* pte; vm_offset_t pdnxt, ptpaddr; int newprot; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; newprot = pte_prot(pmap, prot); if ((sva & PAGE_MASK) || (eva & PAGE_MASK)) panic("pmap_protect: unaligned addresses"); while (sva < eva) { pt_entry_t pbits; /* * If level 1 pte is invalid, skip this segment */ pte = pmap_lev1pte(pmap, sva); if (!pmap_pte_v(pte)) { sva = alpha_l1trunc(sva) + ALPHA_L1SIZE; continue; } /* * If level 2 pte is invalid, skip this segment */ pte = pmap_lev2pte(pmap, sva); if (!pmap_pte_v(pte)) { sva = alpha_l2trunc(sva) + ALPHA_L2SIZE; continue; } /* * If level 3 pte is invalid, skip this page */ pte = pmap_lev3pte(pmap, sva); if (!pmap_pte_v(pte)) { sva += PAGE_SIZE; continue; } if (pmap_pte_prot(pte) != newprot) { pmap_pte_set_prot(pte, newprot); pmap_invalidate_page(pmap, sva); } sva += PAGE_SIZE; } } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. 
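 *
 *	A hedged example, mirroring the way pmap_map() above enters
 *	kernel mappings one page at a time:
 *
 *		pmap_enter(kernel_pmap, va, pa,
 *		    VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 *	Passing TRUE for the last argument wires the mapping (PG_W),
 *	which pmap_remove_pages() will refuse to tear down.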
*/ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot, boolean_t wired) { pt_entry_t *pte; vm_offset_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; int managed; if (pmap == NULL) return; va &= ~PAGE_MASK; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } pte = pmap_lev3pte(pmap, va); #if !defined(MAX_PERF) /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid kernel page tables pmap=%p, va=0x%lx\n", pmap, va); } #endif origpte = *pte; pa &= ~PAGE_MASK; managed = 0; opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; managed = origpte & PG_MANAGED; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); #if !defined(MAX_PERF) if (err) panic("pmap_enter: pte vanished, va: 0x%lx", va); #endif } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); managed |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = pmap_phys_to_pte(pa) | pte_prot(pmap, prot) | PG_V | managed; if (managed) { pv_table_t* ppv; /* * Set up referenced/modified emulation for the new mapping */ ppv = pa_to_pvh(pa); if ((ppv->pv_flags & PV_TABLE_REF) == 0) newpte |= PG_FOR | PG_FOW | PG_FOE; else if ((ppv->pv_flags & PV_TABLE_MOD) == 0) newpte |= PG_FOW; } if (wired) newpte |= PG_W; /* * if the mapping or permission bits are different, we need * to update the pte. */ if (origpte != newpte) { PMAP_DEBUG_VA(va); *pte = newpte; if (origpte) pmap_invalidate_page(pmap, va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte) { register pt_entry_t *pte; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pt_entry_t* l2pte; /* * Calculate lev2 page index */ ptepindex = va >> ALPHA_L2SHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the level 2 entry */ l2pte = pmap_lev2pte(pmap, va); /* * If the level 2 page table is mapped, we just increment * the hold count, and activate it. 
*/ if (l2pte && pmap_pte_v(l2pte)) { if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, va, mpte); return 0; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ PMAP_DEBUG_VA(va); pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pmap_phys_to_pte(pa) | PG_V | PG_KRE | PG_URE | PG_MANAGED; return mpte; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int limit) { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (!pmap) return; psize = alpha_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_flag_set(p, PG_BUSY); mpte = pmap_enter_quick(pmap, addr + alpha_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_flag_set(p, PG_BUSY); mpte = pmap_enter_quick(pmap, addr + alpha_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); vm_page_flag_set(p, PG_MAPPED); vm_page_wakeup(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. 
*/ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; pt_entry_t *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_pte_v(pmap_lev1pte(pmap, addr)) || !pmap_pte_v(pmap_lev2pte(pmap, addr))) continue; pte = vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } vm_page_flag_set(m, PG_BUSY); mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_lev3pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(vm_offset_t pa) { vm_offset_t va = ALPHA_PHYS_TO_K0SEG(pa); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
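 *
 *	On the alpha no temporary mapping is needed: K0SEG is a direct
 *	map of physical memory, so the routine simply offsets both
 *	physical addresses into K0SEG, roughly:
 *
 *		bcopy((caddr_t) ALPHA_PHYS_TO_K0SEG(src),
 *		      (caddr_t) ALPHA_PHYS_TO_K0SEG(dst), PAGE_SIZE);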
*/ void pmap_copy_page(vm_offset_t src, vm_offset_t dst) { src = ALPHA_PHYS_TO_K0SEG(src); dst = ALPHA_PHYS_TO_K0SEG(dst); bcopy((caddr_t) src, (caddr_t) dst, PAGE_SIZE); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; pv_table_t *ppv; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif if (!pmap_pte_v(pte)) panic("pmap_remove_pages: page on pm_pvlist has no pte\n"); tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } PMAP_DEBUG_VA(pv->pv_va); *pte = 0; ppv = pa_to_pvh(pmap_pte_pa(&tpte)); pv->pv_pmap->pm_stats.resident_count--; npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); ppv->pv_list_count--; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { vm_page_flag_clear(ppv->pv_vm_page, PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); pmap_invalidate_all(pmap); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. 
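 *
 *	The routines that follow use these as primitives for the MI
 *	reference/modify interfaces, e.g.:
 *
 *		pmap_changebit(pa, PG_KWE | PG_UWE, FALSE);	write-protect
 *		pmap_changebit(pa, PG_FOW, TRUE);		re-arm mod fault
 *		pmap_changebit(pa, PG_FOR | PG_FOE | PG_FOW, TRUE);  re-arm ref fault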
*/ static boolean_t pmap_testbit(vm_offset_t pa, int bit) { register pv_entry_t pv; pv_table_t *ppv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(vm_offset_t pa, int bit, boolean_t setem) { pv_entry_t pv; pv_table_t *ppv; pt_entry_t *pte; int changed; int s; if (!pmap_is_managed(pa)) return; s = splvm(); changed = 0; ppv = pa_to_pvh(pa); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && bit == (PG_UWE|PG_KWE)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); changed = 0; if (setem) { *pte |= bit; changed = 1; } else { pt_entry_t pbits = *pte; if (pbits & bit) { changed = 1; *pte = pbits & ~bit; } } if (changed) pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } splx(s); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_offset_t phys, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(phys, PG_KWE|PG_UWE, FALSE); } else { pmap_remove_all(phys); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (alpha_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. * */ int pmap_ts_referenced(vm_offset_t pa) { pv_table_t *ppv; int ret; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); ret = (ppv->pv_flags & PV_TABLE_REF) != 0; ppv->pv_flags &= ~PV_TABLE_REF; return ret; } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { pv_table_t *ppv; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); return (ppv->pv_flags & PV_TABLE_MOD) != 0; } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pv_table_t *ppv; if (!pmap_is_managed(pa)) return; ppv = pa_to_pvh(pa); if (ppv->pv_flags & PV_TABLE_MOD) { pmap_changebit(pa, PG_FOW, TRUE); ppv->pv_flags &= ~PV_TABLE_MOD; } } /* * pmap_page_is_free: * * Called when a page is freed to allow pmap to clean up * any extra state associated with the page. In this case * clear modified/referenced bits. */ void pmap_page_is_free(vm_page_t m) { pv_table_t *ppv; ppv = pa_to_pvh(VM_PAGE_TO_PHYS(m)); ppv->pv_flags = 0; } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pv_table_t *ppv; if (!pmap_is_managed(pa)) return; ppv = pa_to_pvh(pa); if (ppv->pv_flags & PV_TABLE_REF) { pmap_changebit(pa, PG_FOR|PG_FOE|PG_FOW, TRUE); ppv->pv_flags &= ~PV_TABLE_REF; } } /* * pmap_emulate_reference: * * Emulate reference and/or modified bit hits. 
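Since the alpha has no hardware referenced/modified bits, the routines above emulate them with the fault-on bits: clearing the reference re-arms PG_FOR/PG_FOE/PG_FOW, and the next access faults into pmap_emulate_reference(), which records REF (and MOD for a write) and disarms the bits. A compact userland model of that cycle, using placeholder bit values rather than the real PTE layout:

#include <stdio.h>

#define PG_FOR 0x1		/* fault on read   (placeholder values) */
#define PG_FOW 0x2		/* fault on write */
#define PG_FOE 0x4		/* fault on execute */
#define PV_TABLE_REF 0x1
#define PV_TABLE_MOD 0x2

static unsigned long pte;
static int pv_flags;

static void
clear_reference(void)
{
	/* pmap_clear_reference(): re-arm all three fault-on bits. */
	pte |= PG_FOR | PG_FOE | PG_FOW;
	pv_flags &= ~PV_TABLE_REF;
}

static void
emulate_reference(int write)
{
	unsigned long faultoff = PG_FOR | PG_FOE;

	pv_flags |= PV_TABLE_REF;		/* (1) always mark as used */
	if (write) {
		pv_flags |= PV_TABLE_MOD;	/* (2) writes also mark modified */
		faultoff |= PG_FOW;
	}
	pte &= ~faultoff;			/* disarm so the access can proceed */
}

int main(void)
{
	clear_reference();
	emulate_reference(1);			/* pretend a write fault arrived */
	printf("fault bits now 0x%lx, REF=%d MOD=%d\n",
	    pte & (PG_FOR | PG_FOW | PG_FOE),
	    (pv_flags & PV_TABLE_REF) != 0, (pv_flags & PV_TABLE_MOD) != 0);
	return 0;
}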
* From NetBSD */ void pmap_emulate_reference(struct proc *p, vm_offset_t v, int user, int write) { pt_entry_t faultoff, *pte; vm_offset_t pa; pv_table_t *ppv; /* * Convert process and virtual address to physical address. */ if (v >= VM_MIN_KERNEL_ADDRESS) { if (user) panic("pmap_emulate_reference: user ref to kernel"); pte = vtopte(v); } else { #ifdef DIAGNOSTIC if (p == NULL) panic("pmap_emulate_reference: bad proc"); if (p->p_vmspace == NULL) panic("pmap_emulate_reference: bad p_vmspace"); #endif pte = pmap_lev3pte(p->p_vmspace->vm_map.pmap, v); } #ifdef DEBUG /* These checks are more expensive */ if (!pmap_pte_v(pte)) panic("pmap_emulate_reference: invalid pte"); #if 0 /* * Can't do these, because cpu_fork and cpu_swapin call * pmap_emulate_reference(), and the bits aren't guaranteed, * for them... */ if (write) { if (!(*pte & (user ? PG_UWE : PG_UWE | PG_KWE))) panic("pmap_emulate_reference: write but unwritable"); if (!(*pte & PG_FOW)) panic("pmap_emulate_reference: write but not FOW"); } else { if (!(*pte & (user ? PG_URE : PG_URE | PG_KRE))) panic("pmap_emulate_reference: !write but unreadable"); if (!(*pte & (PG_FOR | PG_FOE))) panic("pmap_emulate_reference: !write but not FOR|FOE"); } #endif /* Other diagnostics? */ #endif pa = pmap_pte_pa(pte); #ifdef DIAGNOSTIC if ((*pte & PG_MANAGED) == 0) panic("pmap_emulate_reference(%p, 0x%lx, %d, %d): pa 0x%lx not managed", p, v, user, write, pa); #endif /* * Twiddle the appropriate bits to reflect the reference * and/or modification.. * * The rules: * (1) always mark page as used, and * (2) if it was a write fault, mark page as modified. */ ppv = pa_to_pvh(pa); ppv->pv_flags = PV_TABLE_REF; faultoff = PG_FOR | PG_FOE; vm_page_flag_set(ppv->pv_vm_page, PG_REFERENCED); if (write) { ppv->pv_flags |= PV_TABLE_MOD; ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; faultoff |= PG_FOW; } pmap_changebit(pa, faultoff, FALSE); if ((*pte & faultoff) != 0) { #if 1 /* * XXX dfr - don't think its possible in our pmap */ /* * This is apparently normal. Why? -- cgd * XXX because was being called on unmanaged pages? */ panic("warning: pmap_changebit didn't."); #endif *pte &= ~faultoff; ALPHA_TBIS(v); } } /* * Miscellaneous support routines follow */ static void alpha_protection_init() { int prot, *kp, *up; kp = protection_codes[0]; up = protection_codes[1]; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: *kp++ = PG_ASM; *up++ = 0; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = PG_ASM | PG_KRE; *up++ = PG_URE | PG_KRE; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: *kp++ = PG_ASM | PG_KWE; *up++ = PG_UWE | PG_KWE; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_ASM | PG_KWE | PG_KRE; *up++ = PG_UWE | PG_URE | PG_KWE | PG_KRE; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
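alpha_protection_init() above just precomputes a small table: a 3-bit VM protection value (read=1, write=2, execute=4) selects the hardware enable bits, one row for kernel and one for user mappings. A standalone sketch of the same idea, using placeholder PG_* values and omitting the PG_ASM (address-space match) bit the real kernel rows also carry:

#include <stdio.h>

#define VM_PROT_READ    1
#define VM_PROT_WRITE   2
#define VM_PROT_EXECUTE 4

#define PG_KRE 0x01		/* placeholder bit values */
#define PG_KWE 0x02
#define PG_URE 0x04
#define PG_UWE 0x08

static int protection_codes[2][8];

static void
protection_init(void)
{
	int prot, kp, up;

	for (prot = 0; prot < 8; prot++) {
		kp = up = 0;
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			kp |= PG_KRE;
			up |= PG_URE | PG_KRE;
		}
		if (prot & VM_PROT_WRITE) {
			kp |= PG_KWE;
			up |= PG_UWE | PG_KWE;
		}
		protection_codes[0][prot] = kp;		/* kernel mappings */
		protection_codes[1][prot] = up;		/* user mappings */
	}
}

int main(void)
{
	protection_init();
	printf("user read+write -> 0x%x\n",
	    protection_codes[1][VM_PROT_READ | VM_PROT_WRITE]);
	return 0;
}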
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { return (void*) ALPHA_PHYS_TO_K0SEG(pa); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *pte; vm_page_t m; int val = 0; pte = pmap_lev3pte(pmap, addr); if (pte == 0) { return 0; } if (pmap_pte_v(pte)) { pv_table_t *ppv; vm_offset_t pa; val = MINCORE_INCORE; if ((*pte & PG_MANAGED) == 0) return val; pa = pmap_pte_pa(pte); ppv = pa_to_pvh(pa); m = ppv->pv_vm_page; /* * Modified by us */ if (ppv->pv_flags & PV_TABLE_MOD) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (m->dirty || pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (ppv->pv_flags & PV_TABLE_REF) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } } return val; } void pmap_activate(struct proc *p) { pmap_t pmap; pmap = &p->p_vmspace->vm_pmap; if (pmap_active && pmap != pmap_active) { pmap_active->pm_active = 0; pmap_active = 0; } p->p_addr->u_pcb.pcb_hw.apcb_ptbr = ALPHA_K0SEG_TO_PHYS((vm_offset_t) pmap->pm_lev1) >> PAGE_SHIFT; if (pmap->pm_asngen != pmap_current_asngen) pmap_get_asn(pmap); pmap_active = pmap; pmap->pm_active = 1; /* XXX use bitmap for SMP */ p->p_addr->u_pcb.pcb_hw.apcb_asn = pmap->pm_asn; if (p == curproc) { alpha_pal_swpctx((u_long)p->p_md.md_pcbpaddr); } } void pmap_deactivate(struct proc *p) { pmap_t pmap; pmap = &p->p_vmspace->vm_pmap; pmap->pm_active = 0; pmap_active = 0; } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return addr; } #if 0 #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = &p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; pt_entry_t *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { int i, j; vm_offset_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { pv_table_t *ppv; register pv_entry_t pv; printf("pa %x", pa); ppv = pa_to_pvh(pa); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); 
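The flag composition in pmap_mincore() above distinguishes state recorded through this pmap's pv flags from state recorded elsewhere (the vm_page dirty and referenced bits). A standalone restatement of that decision table; the MINCORE_* constants are written out so the sketch compiles on its own:

#include <stdio.h>

#define MINCORE_INCORE           0x1
#define MINCORE_REFERENCED       0x2
#define MINCORE_MODIFIED         0x4
#define MINCORE_REFERENCED_OTHER 0x8
#define MINCORE_MODIFIED_OTHER   0x10

static int
mincore_flags(int valid, int mod_here, int mod_elsewhere,
    int ref_here, int ref_elsewhere)
{
	int val;

	if (!valid)
		return (0);
	val = MINCORE_INCORE;
	if (mod_here)				/* PV_TABLE_MOD set in this pmap */
		val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
	else if (mod_elsewhere)			/* vm_page marked dirty */
		val |= MINCORE_MODIFIED_OTHER;
	if (ref_here)				/* PV_TABLE_REF set in this pmap */
		val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	else if (ref_elsewhere)			/* PG_REFERENCED on the vm_page */
		val |= MINCORE_REFERENCED_OTHER;
	return (val);
}

int main(void)
{
	printf("modified here, referenced elsewhere -> 0x%x\n",
	    mincore_flags(1, 1, 0, 0, 1));
	return 0;
}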
#endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif #endif Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 40699) +++ head/sys/amd64/amd64/pmap.c (revision 40700) @@ -1,3545 +1,3544 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.209 1998/09/06 23:04:20 tegge Exp $ + * $Id: pmap.c,v 1.210 1998/10/21 11:38:14 dg Exp $ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include "opt_disable_pse.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; #define pa_index(pa) atop((pa) - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) static struct pmap kernel_pmap_store; pmap_t kernel_pmap; extern pd_entry_t my_idlePTD; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
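The pmap_pde()/pdir_pde() macros above are plain index arithmetic: on the non-PAE i386 a 32-bit virtual address splits into a 10-bit page-directory index, a 10-bit page-table index and a 12-bit byte offset. A standalone illustration:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PDRSHIFT   22
#define NPTEPG     1024

int main(void)
{
	uint32_t va = 0x0806a123;
	uint32_t pdi = va >> PDRSHIFT;				/* page directory index */
	uint32_t pti = (va >> PAGE_SHIFT) & (NPTEPG - 1);	/* page table index */
	uint32_t off = va & ((1 << PAGE_SHIFT) - 1);		/* byte within the page */

	printf("va 0x%08x -> pde %u, pte %u, offset 0x%03x\n",
	    (unsigned)va, (unsigned)pdi, (unsigned)pti, (unsigned)off);
	return 0;
}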
*/ static vm_offset_t vm_first_phys; static int pgeflag; /* PG_G or-in */ static int pseflag; /* PG_PS or-in */ static int pv_npg; static vm_object_t kptobj; static int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ static vm_zone_t pvzone; static struct vm_zone pvzone_store; static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; static int pmap_pagedaemon_waken = 0; static struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; static pv_table_t *pv_table; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; #ifdef SMP extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[]; extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3; extern pd_entry_t *IdlePTDS[]; extern pt_entry_t SMP_prvpt[]; #endif #ifdef SMP extern unsigned int prv_PPAGE1[]; extern pt_entry_t *prv_PMAP1; #else static pt_entry_t *PMAP1 = 0; static unsigned *PADDR1 = 0; #endif static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static PMAP_INLINE int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); void pmap_collect(void); static unsigned pdir4mb; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ PMAP_INLINE unsigned * pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned *pdeaddr; if (pmap) { pdeaddr = (unsigned *) pmap_pde(pmap, va); if (*pdeaddr & PG_PS) return pdeaddr; if (*pdeaddr) { return get_ptbase(pmap) + i386_btop(va); } } return (0); } /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); } #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. 
* [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i, j; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) #if !defined(SMP) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); #endif virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; *(int *) PTD = 0; pgeflag = 0; #if !defined(SMP) if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Enable the PSE mode */ load_cr4(rcr4() | CR4_PSE); /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE)); ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ #if !defined(SMP) PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic!"); /* 0 = private page */ /* 1 = page table page */ /* 2 = local apic */ /* 16-31 = io apics */ SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (cpu_apic_address & PG_FRAME)); for (i = 0; i < mp_napics; i++) { for (j = 0; j < 16; j++) { /* same page frame as a previous IO apic? 
*/ if (((vm_offset_t)SMP_prvpt[j + 16] & PG_FRAME) == (io_apic_address[0] & PG_FRAME)) { ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } /* use this slot if available */ if (((vm_offset_t)SMP_prvpt[j + 16] & PG_FRAME) == 0) { SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } } if (j == 16) panic("no space to map IO apic %d!", i); } /* BSP does this itself, AP's get it pre-set */ prv_CMAP1 = &SMP_prvpt[3 + UPAGES]; prv_CMAP2 = &SMP_prvpt[4 + UPAGES]; prv_CMAP3 = &SMP_prvpt[5 + UPAGES]; prv_PMAP1 = &SMP_prvpt[6 + UPAGES]; #endif invltlb(); } void getmtrr() { int i; if (cpu == CPU_686) { for(i = 0; i < NPPROVMTRR; i++) { PPro_vmtrr[i].base = rdmsr(PPRO_VMTRRphysBase0 + i * 2); PPro_vmtrr[i].mask = rdmsr(PPRO_VMTRRphysMask0 + i * 2); } } } void putmtrr() { int i; if (cpu == CPU_686) { wbinvd(); for(i = 0; i < NPPROVMTRR; i++) { wrmsr(PPRO_VMTRRphysBase0 + i * 2, PPro_vmtrr[i].base); wrmsr(PPRO_VMTRRphysMask0 + i * 2, PPro_vmtrr[i].mask); } } } void pmap_setvidram(void) { #if 0 if (cpu == CPU_686) { wbinvd(); /* * Set memory between 0-640K to be WB */ wrmsr(0x250, 0x0606060606060606LL); wrmsr(0x258, 0x0606060606060606LL); /* * Set normal, PC video memory to be WC */ wrmsr(0x259, 0x0101010101010101LL); } #endif } void pmap_setdevram(unsigned long long basea, vm_offset_t sizea) { int i, free, skip; unsigned basepage, basepaget; unsigned long long base; unsigned long long mask; if (cpu != CPU_686) return; free = -1; skip = 0; basea &= ~0xfff; base = basea | 0x1; mask = (long long) (0xfffffffffLL - ((long) sizea - 1)) | (long long) 0x800; mask &= ~0x7ff; basepage = (long long) (base >> 12); for(i = 0; i < NPPROVMTRR; i++) { PPro_vmtrr[i].base = rdmsr(PPRO_VMTRRphysBase0 + i * 2); PPro_vmtrr[i].mask = rdmsr(PPRO_VMTRRphysMask0 + i * 2); basepaget = (long long) (PPro_vmtrr[i].base >> 12); if (basepage == basepaget) skip = 1; if ((PPro_vmtrr[i].mask & 0x800) == 0) { if (free == -1) free = i; } } if (!skip && free != -1) { wbinvd(); PPro_vmtrr[free].base = base; PPro_vmtrr[free].mask = mask; wrmsr(PPRO_VMTRRphysBase0 + free * 2, base); wrmsr(PPRO_VMTRRphysMask0 + free * 2, mask); printf( "pmap: added WC mapping at page: 0x%x %x, size: %u mask: 0x%x %x\n", (u_int)(base >> 32), (u_int)base, sizea, (u_int)(mask >> 32), (u_int)mask); } } /* * Set 4mb pdir for mp startup, and global flags */ void pmap_set_opt(unsigned *pdir) { int i; if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb) { pdir[KPTDI] = pdir4mb; } } if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); for(i = KPTDI; i < KPTDI + nkpt; i++) { if (pdir[i]) { pdir[i] |= PG_G; } } } } /* * Setup the PTD for the boot processor */ void pmap_set_opt_bsp(void) { pmap_set_opt((unsigned *)kernel_pmap->pm_pdir); pmap_set_opt((unsigned *)PTD); invltlb(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t s; int i; int initial_pvs; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. 
*/ s = (vm_size_t) (sizeof(pv_table_t) * pv_npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_table_t *) addr; for(i = 0; i < pv_npg; i++) { vm_offset_t pa; TAILQ_INIT(&pv_table[i].pv_list); pv_table[i].pv_list_count = 0; pa = vm_first_phys + i * PAGE_SIZE; pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa); } /* * init the pv free list */ initial_pvs = pv_npg; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg); /* * object for kernel page table pages */ kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified( vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va); } } static PMAP_INLINE void invltlb_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va1); invlpg(va2); } } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); #if defined(SMP) /* The page directory is not shared between CPUs */ cpu_invltlb(); #else invltlb(); #endif } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. 
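The sizing policy in pmap_init2() above caps the pv-entry zone at PMAP_SHPGPERPROC entries per process plus one per managed page, and wakes the pagedaemon (which calls pmap_collect) at 90% of that cap. A trivial standalone restatement with illustrative numbers:

#include <stdio.h>

#define PMAP_SHPGPERPROC 200

int main(void)
{
	int maxproc = 532;		/* illustrative; really a kernel tunable */
	int pv_npg = 32768;		/* managed pages, e.g. 128MB of 4KB pages */
	int pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg;
	int pv_entry_high_water = 9 * (pv_entry_max / 10);

	printf("zone max %d entries, wake the pagedaemon above %d\n",
	    pv_entry_max, pv_entry_high_water);
	return 0;
}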
*/ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; #ifdef SMP if ( ((* (unsigned *) prv_PMAP1) & PG_FRAME) != newpf) { * (unsigned *) prv_PMAP1 = newpf | PG_RW | PG_V; cpu_invlpg(&prv_PPAGE1); } return prv_PPAGE1 + ((unsigned) index & (NPTEPG - 1)); #else if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); #endif } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /* * determine if a page is managed (memory vs. device) */ static PMAP_INLINE int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) invltlb_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... 
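pmap_extract() above has two cases: for a 4MB (PG_PS) mapping the physical address comes straight from the PDE plus the low 22 bits of the VA, while a 4KB mapping goes through the PTE frame plus the page offset. A standalone sketch with made-up frame values:

#include <stdio.h>
#include <stdint.h>

#define NBPDR     (1u << 22)		/* VA span of one PDE: 4MB */
#define PAGE_MASK 0xfffu
#define PG_FRAME  0xfffff000u

int main(void)
{
	uint32_t va = 0x08123456;

	/* PG_PS case: the PDE holds a 4MB frame, the VA supplies the low 22 bits. */
	uint32_t pde_4m = 0x10000000u;				/* made-up frame */
	uint32_t pa_4m = (pde_4m & ~(NBPDR - 1)) | (va & (NBPDR - 1));

	/* 4KB case: the PTE holds the frame, the VA supplies the page offset. */
	uint32_t pte = 0x1234f000u;				/* made-up frame */
	uint32_t pa_4k = (pte & PG_FRAME) | (va & PAGE_MASK);

	printf("4MB mapping pa 0x%08x, 4KB mapping pa 0x%08x\n",
	    (unsigned)pa_4m, (unsigned)pa_4k);
	return 0;
}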
*/ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) invltlb_1pg(va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m && vm_page_sleep(m, "pplookp", NULL)) goto retry; return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. */ void pmap_new_proc(p) struct proc *p; { int i, updateneeded; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek, oldpte; /* * allocate object for the upages */ if ((upobj = p->p_upages_obj) == NULL) { upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; } /* get a kernel virtual address for the UPAGES for this proc */ if ((up = p->p_addr) == NULL) { up = (struct user *) kmem_alloc_pageable(kernel_map, UPAGES * PAGE_SIZE); #if !defined(MAX_PERF) if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); #endif p->p_addr = up; } ptek = (unsigned *) vtopte((vm_offset_t) up); updateneeded = 0; for(i=0;iwire_count++; cnt.v_wire_count++; oldpte = *(ptek + i); /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; if (oldpte) { if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386)) { invlpg((vm_offset_t) up + i * PAGE_SIZE); } else { updateneeded = 1; } } vm_page_wakeup(m); m->flags &= ~PG_ZERO; m->flags |= PG_MAPPED | PG_WRITEABLE; m->valid = VM_PAGE_BITS_ALL; } if (updateneeded) invltlb(); } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek, oldpte; upobj = p->p_upages_obj; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); for(i=0;iflags |= PG_BUSY; oldpte = *(ptek + i); *(ptek + i) = 0; if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386)) invlpg((vm_offset_t) p->p_addr + i * PAGE_SIZE); - vm_page_unwire(m); + vm_page_unwire(m, 0); vm_page_free(m); } if (cpu_class <= CPUCLASS_386) invltlb(); } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;idirty = VM_PAGE_BITS_ALL; - vm_page_unwire(m); - vm_page_deactivate(m); + vm_page_unwire(m, 0); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. */ void pmap_swapin_proc(p) struct proc *p; { int i,rv; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;ip_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(upobj, &m, 1, 0); #if !defined(MAX_PERF) if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); #endif m = vm_page_lookup(upobj, i); m->valid = VM_PAGE_BITS_ALL; } vm_page_wire(m); vm_page_wakeup(m); m->flags |= PG_MAPPED | PG_WRITEABLE; } } /*************************************************** * Page table page management routines..... 
***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { int s; while (vm_page_sleep(m, "pmuwpt", NULL)); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); invltlb_1pg(pteva); } if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { if (m->flags & PG_WANTED) { m->flags &= ~PG_WANTED; wakeup(m); } m->flags |= PG_BUSY; vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } } return pmap_unwire_pte_hold(pmap, mpte); } #if !defined(SMP) void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } #else void pmap_pinit0(pmap) struct pmap *pmap; { pmap_pinit(pmap); } #endif /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
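The page-table-page accounting above works in two layers: each managed mapping holds the PT page, the hold count is backed by a wire count, and only when the last mapping goes away is the page unmapped from the directory and freed. A toy userland model of that bookkeeping (not the kernel's vm_page):

#include <stdio.h>

struct ptpage {
	int hold_count;		/* one per mapping living in this PT page */
	int wire_count;		/* backs the hold count; 1 from allocation */
	int freed;
};

static void
mapping_added(struct ptpage *p)
{
	p->hold_count++;
}

static void
mapping_removed(struct ptpage *p)
{
	if (--p->hold_count > 0)
		return;
	/* Last mapping gone: the kernel unmaps the PT page here... */
	if (--p->wire_count == 0)
		p->freed = 1;	/* ...and vm_page_free_zero()s it. */
}

int main(void)
{
	struct ptpage pt = { 0, 1, 0 };

	mapping_added(&pt);
	mapping_added(&pt);
	mapping_removed(&pt);
	mapping_removed(&pt);
	printf("hold %d, wire %d, freed %d\n",
	    pt.hold_count, pt.wire_count, pt.freed);
	return 0;
}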
*/ if (pmap->pm_pdir == NULL) pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); ptdpg->wire_count = 1; ++cnt.v_wire_count; ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ if (vm_page_sleep(p, "pmaprl", NULL)) return 0; p->flags |= PG_BUSY; /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; pmap->pm_stats.resident_count--; #if !defined(MAX_PERF) if (p->hold_count) { panic("pmap_release: freeing held page table page"); } #endif /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; p->wire_count--; cnt.v_wire_count--; vm_page_free_zero(p); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_ZERO | VM_ALLOC_RETRY); if (m->queue != PQ_NONE) { int s = splvm(); vm_page_unqueue(m); splx(s); } if (m->wire_count == 0) cnt.v_wire_count++; m->wire_count++; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); /* * Set the page table hint */ pmap->pm_ptphint = m; /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. 
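The self-referential entry installed in pmap_pinit() above is the recursive page-directory trick: pointing one PDE slot back at the page directory makes every PTE of the address space visible through a fixed virtual window (PTmap), with the directory itself appearing as one page inside that window. A sketch of the resulting address arithmetic; the slot number is an example, not necessarily the kernel's PTDPTDI:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PDRSHIFT   22

int main(void)
{
	unsigned slot = 1022;				/* example recursive PDE slot */
	uint32_t ptmap = (uint32_t)slot << PDRSHIFT;	/* base of the PTE window */
	uint32_t va = 0x08123456;

	/* The PTE for "va" becomes readable at: */
	uint32_t pte_va = ptmap + ((va >> PAGE_SHIFT) << 2);
	/* ...and the page directory itself appears as one page at: */
	uint32_t pd_va = ptmap + ((uint32_t)slot << PAGE_SHIFT);

	printf("PTmap 0x%08x, pte for va at 0x%08x, page directory at 0x%08x\n",
	    (unsigned)ptmap, (unsigned)pte_va, (unsigned)pd_va);
	return 0;
}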
*/ if ((m->flags & PG_ZERO) == 0) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; m->flags &= ~(PG_ZERO | PG_BUSY); m->flags |= PG_MAPPED; return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; int curgeneration; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; retry: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; vm_offset_t ptppaddr; vm_page_t nkpg; #ifdef SMP int i; #endif pd_entry_t newpdir; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM); #if !defined(MAX_PERF) if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); #endif nkpt++; vm_page_wire(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; #ifdef SMP for (i = 0; i < mp_ncpus; i++) { if (IdlePTDS[i]) pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir; } #endif for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = newpdir; } } *pmap_pde(kernel_pmap, 
kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); #if !defined(MAX_PERF) panic("destroying a pmap is not yet implemented"); #endif } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfreei(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloci(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { pv_table_t *ppv; int i; vm_offset_t pa; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < pv_npg; i++) { if ((ppv = &pv_table[i]) == 0) continue; m = ppv->pv_vm_page; if ((pa = VM_PAGE_TO_PHYS(m)) == 0) continue; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(pa); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, ppv, va) struct pmap *pmap; pv_table_t *ppv; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (ppv->pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; if (TAILQ_FIRST(&ppv->pv_list) == NULL) ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
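pmap_growkernel() above only moves the kernel page-table frontier in whole page-table spans (NPTEPG pages, i.e. 4MB on the i386), so each step allocates exactly one new kernel PT page and propagates the new PDE to every pmap. The rounding it relies on, shown in isolation:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096u
#define NPTEPG    1024u
#define PTSPAN    (PAGE_SIZE * NPTEPG)	/* VA covered by one kernel PT page */

int main(void)
{
	uint32_t addr = 0xc85f2000;	/* requested new end of kernel VA */
	uint32_t frontier = (addr + PTSPAN) & ~(PTSPAN - 1);

	printf("grow to 0x%08x -> frontier rounds to 0x%08x\n",
	    (unsigned)addr, (unsigned)frontier);
	return 0;
}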
*/ static void pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_offset_t pa; { int s; pv_entry_t pv; pv_table_t *ppv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); ppv = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); ppv->pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; pv_table_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { ppv = pa_to_pvh(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (oldpte & PG_A) ppv->pv_vm_page->flags |= PG_REFERENCED; return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); invltlb_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (((sva + PAGE_SIZE) == eva) && (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); if (pmap->pm_stats.resident_count == 0) break; pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
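The range loops in pmap_remove() and pmap_protect() work one page-table page at a time: sindex/eindex are page indices, and pdnxt rounds sindex up to the next page-table boundary, clipped to the end of the range. A standalone sketch of that chunking:

#include <stdio.h>

#define NPTEPG 1024u

int main(void)
{
	unsigned sindex = 34567;	/* first page index in the range */
	unsigned eindex = 40000;	/* one past the last page index */

	while (sindex < eindex) {
		unsigned pdnxt = (sindex + NPTEPG) & ~(NPTEPG - 1);

		if (pdnxt > eindex)
			pdnxt = eindex;
		printf("PT page %u: page indices %u..%u\n",
		    sindex / NPTEPG, sindex, pdnxt - 1);
		sindex = pdnxt;
	}
	return 0;
}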
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { invltlb(); } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte, tpte; int nmodify; int update_needed; int s; nmodify = 0; update_needed = 0; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", pa); } #endif s = splvm(); ppv = pa_to_pvh(pa); while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) ppv->pv_vm_page->flags |= PG_REFERENCED; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (!update_needed && ((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) || (pv->pv_pmap == kernel_pmap))) { update_needed = 1; } TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); if (update_needed) invltlb(); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { register unsigned *ptbase; vm_offset_t pdnxt, ptpaddr; vm_pindex_t sindex, eindex; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { (unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. 
*/ if (ptpaddr == 0) continue; if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits; pv_table_t *ppv; pbits = ptbase[sindex]; if (pbits & PG_MANAGED) { ppv = NULL; if (pbits & PG_A) { ppv = pa_to_pvh(pbits); ppv->pv_vm_page->flags |= PG_REFERENCED; pbits &= ~PG_A; } if (pbits & PG_M) { if (pmap_track_modified(i386_ptob(sindex))) { if (ppv == NULL) ppv = pa_to_pvh(pbits); ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; pbits &= ~PG_M; } } } pbits &= ~PG_RW; if (pbits != ptbase[sindex]) { ptbase[sindex] = pbits; anychanged = 1; } } } if (anychanged) invltlb(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot, boolean_t wired) { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[cpuid]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); #if !defined(MAX_PERF) /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n", (void *)pmap->pm_pdir[PTDPTDI], va); } #endif origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; #if !defined(MAX_PERF) if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); #endif /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. 
*/ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { *pte |= PG_RW; invltlb_1pg(va); } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { pv_table_t *ppv; ppv = pa_to_pvh(opa); ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); #if !defined(MAX_PERF) if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); #endif } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte | PG_A; if (origpte) invltlb_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, pa, mpte) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_page_t mpte; { register unsigned *pte; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { #if !defined(MAX_PERF) if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); #endif if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. 
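The speed difference mentioned here comes from the recursive page-table mapping: because the current page directory maps itself, every PTE of the current address space shows up as one linear array (PTmap) in kernel virtual memory, so finding the PTE for an address is a single shift and index, while pmap_pte_quick() may first have to establish a temporary mapping of a foreign page-table page. A rough sketch of that index arithmetic, with PAGE_SHIFT and the types chosen here purely for illustration:

#include <stdint.h>

#define PAGE_SHIFT 12                  /* 4 KB pages, as on i386 */

typedef uint32_t pt_entry;

/*
 * Model of vtopte(): ptmap_base stands in for the PTmap window created
 * by the self-referencing page-directory entry; the PTE for va is just
 * an array index away.
 */
static pt_entry *
model_vtopte(pt_entry *ptmap_base, uintptr_t va)
{
    return (&ptmap_base[va >> PAGE_SHIFT]);
}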
*/ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (!pmap) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; int s; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; retry: p = vm_page_lookup(object, pindex); if (p && vm_page_sleep(p, "init4p", NULL)) goto retry; if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_free(p); return; } p = vm_page_lookup(object, pindex); vm_page_wakeup(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } p->flags |= PG_MAPPED; invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; vm_page_wakeup(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; vm_page_wakeup(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. 
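The OBJT_DEVICE branch of pmap_object_init_pt() above maps a 4MB-aligned, physically contiguous region by writing PG_PS page-directory entries, one per 4MB chunk, so no level-2 page tables are needed at all. A hedged sketch of that shape (PDRSHIFT, NBPDR, the flag values and pde_t below are illustrative stand-ins, not the kernel's headers):

#include <stddef.h>
#include <stdint.h>

#define PDRSHIFT 22                    /* log2(4 MB) with PSE enabled */
#define NBPDR    (1u << PDRSHIFT)

#define PG_V  0x001u                   /* illustrative flag values */
#define PG_RW 0x002u
#define PG_U  0x004u
#define PG_PS 0x080u

typedef uint32_t pde_t;

/*
 * Fill one page-directory entry per 4 MB of the region; each PDE maps
 * its chunk directly, marked PG_PS ("page size", i.e. a superpage).
 */
static void
map_with_4mb_pages(pde_t *pdir, uintptr_t va, uint32_t pa, size_t size)
{
    unsigned idx = va >> PDRSHIFT;
    size_t npdes = size >> PDRSHIFT;

    for (size_t i = 0; i < npdes; i++) {
        pdir[idx] = pa | PG_U | PG_RW | PG_V | PG_PS;
        pa += NBPDR;
        idx++;
    }
}

The alignment tests on addr and size earlier in the routine are what make this shortcut safe to take.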
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); m->flags |= PG_MAPPED; vm_page_wakeup(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
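pmap_prefault() above walks a short table of offsets around the faulting address, alternately one to four pages behind and ahead, and opportunistically enters any page that is already resident and unbusy. A compact model of just the candidate-address selection, with the residency and busy checks reduced to bounds tests; entry_start and entry_end stand in for the vm_map_entry limits and the constants are repeated here only for illustration:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define PFBAK 4
#define PFFOR 4
#define PAGEORDER_SIZE (PFBAK + PFFOR)

/* Alternating offsets: 1 page behind, 1 ahead, 2 behind, 2 ahead, ... */
static const long pageorder[PAGEORDER_SIZE] = {
    -1 * PAGE_SIZE, 1 * PAGE_SIZE,
    -2 * PAGE_SIZE, 2 * PAGE_SIZE,
    -3 * PAGE_SIZE, 3 * PAGE_SIZE,
    -4 * PAGE_SIZE, 4 * PAGE_SIZE,
};

/* Print the cluster of addresses a prefault pass would consider. */
static void
prefault_candidates(uintptr_t addra, uintptr_t entry_start, uintptr_t entry_end)
{
    uintptr_t starta = (addra > entry_start + PFBAK * PAGE_SIZE) ?
        addra - PFBAK * PAGE_SIZE : entry_start;

    for (int i = 0; i < PAGEORDER_SIZE; i++) {
        uintptr_t addr = addra + pageorder[i];

        if (addr < starta || addr >= entry_end)
            continue;                  /* outside the map entry */
        printf("candidate: %#lx\n", (unsigned long)addr);
    }
}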
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) { return; } dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); invltlb(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; unsigned ptepindex; #if !defined(MAX_PERF) if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); #endif /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. */ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); ptepindex = addr >> PDRSHIFT; srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = (unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; dst_pte++; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. 
*/ void pmap_zero_page(phys) vm_offset_t phys; { #ifdef SMP #if !defined(MAX_PERF) if (*(int *) prv_CMAP3) panic("pmap_zero_page: prv_CMAP3 busy"); #endif *(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; cpu_invlpg(&prv_CPAGE3); #if defined(I686_CPU) if (cpu == CPU_686) i686_pagezero(&prv_CPAGE3); else #endif bzero(&prv_CPAGE3, PAGE_SIZE); *(int *) prv_CMAP3 = 0; #else #if !defined(MAX_PERF) if (*(int *) CMAP2) panic("pmap_zero_page: CMAP2 busy"); #endif *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; if (cpu_class == CPUCLASS_386) { invltlb(); } else { invlpg((u_int)CADDR2); } #if defined(I686_CPU) if (cpu == CPU_686) i686_pagezero(CADDR2); else #endif bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; #endif } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { #ifdef SMP #if !defined(MAX_PERF) if (*(int *) prv_CMAP1) panic("pmap_copy_page: prv_CMAP1 busy"); if (*(int *) prv_CMAP2) panic("pmap_copy_page: prv_CMAP2 busy"); #endif *(int *) prv_CMAP1 = PG_V | (src & PG_FRAME) | PG_A; *(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M; cpu_invlpg(&prv_CPAGE1); cpu_invlpg(&prv_CPAGE2); bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE); *(int *) prv_CMAP1 = 0; *(int *) prv_CMAP2 = 0; #else #if !defined(MAX_PERF) if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); #endif *(int *) CMAP1 = PG_V | (src & PG_FRAME) | PG_A; *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M; if (cpu_class == CPUCLASS_386) { invltlb(); } else { invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); } bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; #endif } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
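pmap_zero_page() and pmap_copy_page() above never touch physical memory directly: they borrow a reserved PTE (CMAP1/CMAP2, or the per-CPU prv_CMAP slots under SMP), point it at the target frame, do the bzero or bcopy through the matching kernel address, and then clear the slot and invalidate that one mapping. A toy single-window model of that discipline; the window, the phys_mem array and the assertion are made up for illustration and only mimic the busy check, not real mapping hardware:

#include <assert.h>
#include <string.h>

#define PAGE_SIZE  4096
#define NPHYSPAGES 16

/* "Physical memory": an array of page-sized frames. */
static unsigned char phys_mem[NPHYSPAGES][PAGE_SIZE];

/* One reusable mapping window, analogous to CMAP2/CADDR2. */
static unsigned char *window;          /* NULL means the slot is free */

static unsigned char *
window_map(int frame)
{
    assert(window == NULL);            /* mirrors the "CMAP2 busy" panic */
    window = phys_mem[frame];
    return (window);
}

static void
window_unmap(void)
{
    window = NULL;                     /* kernel: *CMAP2 = 0, then invalidate */
}

static void
zero_phys_page(int frame)
{
    memset(window_map(frame), 0, PAGE_SIZE);
    window_unmap();
}

static void
copy_phys_page(int src, int dst)
{
    unsigned char *d = window_map(dst);

    /* the kernel uses a second window (CMAP1) for the source side */
    memcpy(d, phys_mem[src], PAGE_SIZE);
    window_unmap();
}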
*/ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { unsigned *pte, tpte; pv_table_t *ppv; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = (unsigned *)vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; ppv = pa_to_pvh(tpte); pv->pv_pmap->pm_stats.resident_count--; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); ppv->pv_list_count--; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); invltlb(); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (bit & (PG_A|PG_M)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte; int changed; int s; if (!pmap_is_managed(pa)) return; s = splvm(); changed = 0; ppv = pa_to_pvh(pa); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *(int *)pte |= bit; changed = 1; } else { vm_offset_t pbits = *(vm_offset_t *)pte; if (pbits & bit) { changed = 1; if (bit == PG_RW) { if (pbits & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } *(int *)pte = pbits & ~(PG_M|PG_RW); } else { *(int *)pte = pbits & ~bit; } } } } splx(s); if (changed) invltlb(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. 
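All of the reverse-map operations here (pmap_remove_pages, pmap_testbit, pmap_changebit, pmap_page_protect) share one structure: every managed physical page has an entry in pv_table, reached through pa_to_pvh(), holding a list of pv entries, one per (pmap, va) mapping of that page, and each routine simply walks that list touching the PTE behind every mapping. A stripped-down model of the structure and of the testbit/changebit walks; the type and field names below are illustrative, not the kernel's:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t pte_t;

/* One entry per active mapping of a physical page. */
struct pv_entry {
    struct pv_entry *next;
    void            *pmap;             /* which address space maps it */
    uintptr_t        va;               /* where it is mapped there */
    pte_t           *pte;              /* the PTE backing that mapping */
};

/* Per-physical-page list head, the analogue of pa_to_pvh(pa). */
struct pv_head {
    struct pv_entry *first;
    int              list_count;
};

/* pmap_testbit(): does any mapping of the page have the bit set? */
static int
page_test_bit(const struct pv_head *ph, pte_t bit)
{
    for (const struct pv_entry *pv = ph->first; pv != NULL; pv = pv->next)
        if (*pv->pte & bit)
            return (1);
    return (0);
}

/* pmap_changebit(..., FALSE): clear a bit in every mapping of the page. */
static int
page_clear_bit(struct pv_head *ph, pte_t bit)
{
    int changed = 0;

    for (struct pv_entry *pv = ph->first; pv != NULL; pv = pv->next)
        if (*pv->pte & bit) {
            *pv->pte &= ~bit;
            changed = 1;               /* caller should flush the TLB */
        }
    return (changed);
}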
*/ void pmap_page_protect(vm_offset_t phys, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(phys, PG_RW, FALSE); } else { pmap_remove_all(phys); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. * */ int pmap_ts_referenced(vm_offset_t pa) { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; int rtval = 0; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { splx(s); return 0; } /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (!pmap_track_modified(pv->pv_va)) { TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); continue; } pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) { TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); continue; } if (*pte & PG_A) { rtval++; *pte &= ~PG_A; if (rtval > 4) { TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); break; } } TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); } splx(s); if (rtval) { invltlb(); } return (rtval); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
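i386_protection_init() above boils the eight VM_PROT_READ/WRITE/EXECUTE combinations down to two hardware encodings, because the i386 PTE has no execute bit and anything readable is also executable. A small stand-alone version of the same table construction; the VM_PROT_* values follow the conventional 1/2/4 encoding and PG_RW is an illustrative bit value:

#include <stdio.h>

#define VM_PROT_READ    0x1
#define VM_PROT_WRITE   0x2
#define VM_PROT_EXECUTE 0x4

#define PG_RW 0x2                      /* illustrative write-enable bit */

static int protection_codes[8];

/*
 * Anything that allows writing maps to PG_RW; every other combination
 * (none, read, execute, read+execute) maps to 0, i.e. read-only.
 */
static void
protection_init(void)
{
    for (int prot = 0; prot < 8; prot++)
        protection_codes[prot] = (prot & VM_PROT_WRITE) ? PG_RW : 0;
}

int
main(void)
{
    protection_init();
    for (int prot = 0; prot < 8; prot++)
        printf("prot %d -> 0x%x\n", prot, protection_codes[prot]);
    return (0);
}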
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); #if !defined(MAX_PERF) if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); #endif pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { pv_table_t *ppv; vm_offset_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; ppv = pa_to_pvh((pa & PG_FRAME)); m = ppv->pv_vm_page; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (m->dirty || pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; m->flags |= PG_REFERENCED; } } return val; } void pmap_activate(struct proc *p) { #if defined(SWTCH_OPTIM_STATS) tlb_flush_count++; #endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(p->p_vmspace->vm_pmap.pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = &p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { pv_table_t *ppv; register pv_entry_t pv; printf("pa %x", pa); ppv = pa_to_pvh(pa); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %p, va %x, flags %x", (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif 
printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/fs/procfs/procfs_mem.c =================================================================== --- head/sys/fs/procfs/procfs_mem.c (revision 40699) +++ head/sys/fs/procfs/procfs_mem.c (revision 40700) @@ -1,342 +1,342 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 Sean Eric Fagan * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry and Sean Eric Fagan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_mem.c 8.5 (Berkeley) 6/15/94 * - * $Id: procfs_mem.c,v 1.33 1998/06/07 17:11:57 dfr Exp $ + * $Id: procfs_mem.c,v 1.34 1998/07/15 02:32:19 bde Exp $ */ /* * This is a lightly hacked and merged version * of sef's pread/pwrite functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int procfs_rwmem __P((struct proc *curp, struct proc *p, struct uio *uio)); static int procfs_rwmem(curp, p, uio) struct proc *curp; struct proc *p; struct uio *uio; { int error; int writing; struct vmspace *vm; vm_map_t map; vm_object_t object = NULL; vm_offset_t pageno = 0; /* page number */ vm_prot_t reqprot; vm_offset_t kva; /* * if the vmspace is in the midst of being deallocated or the * process is exiting, don't try to grab anything. The page table * usage in that process can be messed up. */ vm = p->p_vmspace; if ((p->p_flag & P_WEXIT) || (vm->vm_refcnt < 1)) return EFAULT; ++vm->vm_refcnt; /* * The map we want... */ map = &vm->vm_map; writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) : VM_PROT_READ; kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE); /* * Only map in one page at a time. 
We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_map_t tmap; vm_offset_t uva; int page_offset; /* offset into page */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired; vm_pindex_t pindex; u_int len; vm_page_t m; object = NULL; uva = (vm_offset_t) uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); if (uva >= VM_MAXUSER_ADDRESS) { vm_offset_t tkva; if (writing || uva >= VM_MAXUSER_ADDRESS + UPAGES * PAGE_SIZE || (ptrace_read_u_check(p, uva - (vm_offset_t) VM_MAXUSER_ADDRESS, (size_t) len) && !procfs_kmemaccess(curp))) { error = 0; break; } /* we are reading the "U area", force it into core */ PHOLD(p); /* sanity check */ if (!(p->p_flag & P_INMEM)) { /* aiee! */ PRELE(p); error = EFAULT; break; } /* populate the ptrace/procfs area */ p->p_addr->u_kproc.kp_proc = *p; fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); /* locate the in-core address */ tkva = (uintptr_t)p->p_addr + uva - VM_MAXUSER_ADDRESS; /* transfer it */ error = uiomove((caddr_t)tkva, len, uio); /* let the pages go */ PRELE(p); continue; } /* * Fault the page on behalf of the process */ error = vm_fault(map, pageno, reqprot, FALSE); if (error) { error = EFAULT; break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry, &object, &pindex, &out_prot, &wired); if (error) { error = EFAULT; /* * Make sure that there is no residue in 'object' from * an error return on vm_map_lookup. */ object = NULL; break; } m = vm_page_lookup(object, pindex); /* Allow fallback to backing objects if we are reading */ while (m == NULL && !writing && object->backing_object) { pindex += OFF_TO_IDX(object->backing_object_offset); object = object->backing_object; m = vm_page_lookup(object, pindex); } if (m == NULL) { error = EFAULT; /* * Make sure that there is no residue in 'object' from * an error return on vm_map_lookup. */ object = NULL; vm_map_lookup_done(tmap, out_entry); break; } /* * Wire the page into memory */ vm_page_wire(m); /* * We're done with tmap now. * But reference the object first, so that we won't loose * it. */ vm_object_reference(object); vm_map_lookup_done(tmap, out_entry); pmap_kenter(kva, VM_PAGE_TO_PHYS(m)); /* * Now do the i/o move. */ error = uiomove((caddr_t)(kva + page_offset), len, uio); pmap_kremove(kva); /* * release the page and the object */ - vm_page_unwire(m); + vm_page_unwire(m, 1); vm_object_deallocate(object); object = NULL; } while (error == 0 && uio->uio_resid > 0); if (object) vm_object_deallocate(object); kmem_free(kernel_map, kva, PAGE_SIZE); vmspace_free(vm); return (error); } /* * Copy data in and out of the target process. * We do this by mapping the process's page into * the kernel and then doing a uiomove direct * from the kernel address space. */ int procfs_domem(curp, p, pfs, uio) struct proc *curp; struct proc *p; struct pfsnode *pfs; struct uio *uio; { if (uio->uio_resid == 0) return (0); /* * XXX * We need to check for KMEM_GROUP because ps is sgid kmem; * not allowing it here causes ps to not work properly. Arguably, * this is a bug with what ps does. We only need to do this * for Pmem nodes, and only if it's reading. 
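The transfer loop in procfs_rwmem() above deliberately moves at most one page per iteration, so the only arithmetic that matters is clipping the copy length to the end of the current page and to the bytes still outstanding in the request. That clipping looks like this in isolation; resid plays the role of uio->uio_resid and the constants are illustrative:

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096u
#define PAGE_MASK (PAGE_SIZE - 1)

/* Bytes that may be copied starting at uva without crossing a page. */
static size_t
page_chunk(uintptr_t uva, size_t resid)
{
    size_t page_offset = uva & PAGE_MASK;  /* same as uva - trunc_page(uva) */
    size_t len = PAGE_SIZE - page_offset;

    return (len < resid ? len : resid);
}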
This is still not * good, as it may still be possible to grab illicit data if * a process somehow gets to be KMEM_GROUP. Note that this also * means that KMEM_GROUP can't change without editing procfs.h! * All in all, quite yucky. */ if (!CHECKIO(curp, p) && !(uio->uio_rw == UIO_READ && procfs_kmemaccess(curp))) return EPERM; return (procfs_rwmem(curp, p, uio)); } /* * Given process (p), find the vnode from which * its text segment is being executed. * * It would be nice to grab this information from * the VM system, however, there is no sure-fire * way of doing that. Instead, fork(), exec() and * wait() all maintain the p_textvp field in the * process proc structure which contains a held * reference to the exec'ed vnode. */ struct vnode * procfs_findtextvp(p) struct proc *p; { return (p->p_textvp); } int procfs_kmemaccess(curp) struct proc *curp; { int i; struct ucred *cred; cred = curp->p_cred->pc_ucred; if (suser(cred, &curp->p_acflag)) return 1; for (i = 0; i < cred->cr_ngroups; i++) if (cred->cr_groups[i] == KMEM_GROUP) return 1; return 0; } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 40699) +++ head/sys/i386/i386/pmap.c (revision 40700) @@ -1,3545 +1,3544 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.209 1998/09/06 23:04:20 tegge Exp $ + * $Id: pmap.c,v 1.210 1998/10/21 11:38:14 dg Exp $ */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_disable_pse.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; #define pa_index(pa) atop((pa) - vm_first_phys) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) static struct pmap kernel_pmap_store; pmap_t kernel_pmap; extern pd_entry_t my_idlePTD; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ static vm_offset_t vm_first_phys; static int pgeflag; /* PG_G or-in */ static int pseflag; /* PG_PS or-in */ static int pv_npg; static vm_object_t kptobj; static int nkpt; vm_offset_t kernel_vm_end; /* * Data for the pv entry allocation mechanism */ static vm_zone_t pvzone; static struct vm_zone pvzone_store; static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; static int pmap_pagedaemon_waken = 0; static struct pv_entry *pvinit; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *ptmmap; static pv_table_t *pv_table; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp=0; #ifdef SMP extern char prv_CPAGE1[], prv_CPAGE2[], prv_CPAGE3[]; extern pt_entry_t *prv_CMAP1, *prv_CMAP2, *prv_CMAP3; extern pd_entry_t *IdlePTDS[]; extern pt_entry_t SMP_prvpt[]; #endif #ifdef SMP extern unsigned int prv_PPAGE1[]; extern pt_entry_t *prv_PMAP1; #else static pt_entry_t *PMAP1 = 0; static unsigned *PADDR1 = 0; #endif static PMAP_INLINE void free_pv_entry __P((pv_entry_t pv)); static unsigned * get_ptbase __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static PMAP_INLINE int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static vm_page_t pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_page_t mpte)); static int pmap_remove_pte __P((struct pmap *pmap, unsigned *ptq, vm_offset_t sva)); static void pmap_remove_page __P((struct pmap *pmap, vm_offset_t va)); static int pmap_remove_entry __P((struct pmap *pmap, pv_table_t *pv, vm_offset_t va)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); static void pmap_insert_entry __P((pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_offset_t pa)); static vm_page_t pmap_allocpte __P((pmap_t pmap, vm_offset_t va)); static int pmap_release_free_page __P((pmap_t pmap, vm_page_t p)); static vm_page_t _pmap_allocpte __P((pmap_t pmap, unsigned ptepindex)); static unsigned * pmap_pte_quick __P((pmap_t pmap, vm_offset_t va)); static vm_page_t pmap_page_lookup __P((vm_object_t object, vm_pindex_t pindex)); static int pmap_unuse_pt __P((pmap_t, vm_offset_t, vm_page_t)); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); void pmap_collect(void); static unsigned pdir4mb; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ PMAP_INLINE unsigned * pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned *pdeaddr; if (pmap) { pdeaddr = (unsigned *) pmap_pde(pmap, va); if (*pdeaddr & PG_PS) return pdeaddr; if (*pdeaddr) { return get_ptbase(pmap) + i386_btop(va); } } return (0); } /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) { newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); } #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. 
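pmap_pte() above, and pmap_extract() further on, are both the same two-step walk: the top 10 bits of the address index the page directory, and if that PDE is a 4MB PG_PS mapping the walk stops there, otherwise the next 10 bits index the level-2 page table. A self-contained model of that walk; the flag values are illustrative, and ptables[] is a stand-in for following the PDE's physical frame to its page table:

#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  0xfffu
#define PDRSHIFT   22
#define NBPDR      (1u << PDRSHIFT)
#define NPTEPG     1024u
#define PG_FRAME   0xfffff000u
#define PG_V       0x001u              /* illustrative flag values */
#define PG_PS      0x080u

typedef uint32_t pd_entry;
typedef uint32_t pt_entry;

/*
 * Two-level translation of va to a physical address.  Returns 0 for an
 * unmapped address, as pmap_extract() does.
 */
static uint32_t
model_extract(const pd_entry *pdir, pt_entry *const ptables[], uint32_t va)
{
    unsigned pdi = va >> PDRSHIFT;
    pd_entry pde = pdir[pdi];

    if ((pde & PG_V) == 0)
        return (0);
    if (pde & PG_PS)                   /* one PDE maps a whole 4 MB */
        return ((pde & ~(NBPDR - 1)) | (va & (NBPDR - 1)));

    pt_entry pte = ptables[pdi][(va >> PAGE_SHIFT) & (NPTEPG - 1)];
    if ((pte & PG_V) == 0)
        return (0);
    return ((pte & PG_FRAME) | (va & PAGE_MASK));
}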
* [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i, j; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); kernel_pmap->pm_count = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = (pt_entry_t *) pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) #if !defined(SMP) /* * ptemap is used for pmap_pte_quick */ SYSMAP(unsigned *, PMAP1, PADDR1, 1); #endif virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = 0; *(int *) PTD = 0; pgeflag = 0; #if !defined(SMP) if (cpu_feature & CPUID_PGE) { pgeflag = PG_G; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #if !defined(DISABLE_PSE) if (cpu_feature & CPUID_PSE) { unsigned ptditmp; /* * Enable the PSE mode */ load_cr4(rcr4() | CR4_PSE); /* * Note that we have enabled PSE mode */ pseflag = PG_PS; ptditmp = *((unsigned *)PTmap + i386_btop(KERNBASE)); ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; /* * We can do the mapping here for the single processor * case. We simply ignore the old page table page from * now on. */ #if !defined(SMP) PTD[KPTDI] = (pd_entry_t) ptditmp; kernel_pmap->pm_pdir[KPTDI] = (pd_entry_t) ptditmp; invltlb(); #endif } #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic!"); /* 0 = private page */ /* 1 = page table page */ /* 2 = local apic */ /* 16-31 = io apics */ SMP_prvpt[2] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (cpu_apic_address & PG_FRAME)); for (i = 0; i < mp_napics; i++) { for (j = 0; j < 16; j++) { /* same page frame as a previous IO apic? 
*/ if (((vm_offset_t)SMP_prvpt[j + 16] & PG_FRAME) == (io_apic_address[0] & PG_FRAME)) { ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } /* use this slot if available */ if (((vm_offset_t)SMP_prvpt[j + 16] & PG_FRAME) == 0) { SMP_prvpt[j + 16] = (pt_entry_t)(PG_V | PG_RW | pgeflag | (io_apic_address[i] & PG_FRAME)); ioapic[i] = (ioapic_t *)&SMP_ioapic[j * PAGE_SIZE]; break; } } if (j == 16) panic("no space to map IO apic %d!", i); } /* BSP does this itself, AP's get it pre-set */ prv_CMAP1 = &SMP_prvpt[3 + UPAGES]; prv_CMAP2 = &SMP_prvpt[4 + UPAGES]; prv_CMAP3 = &SMP_prvpt[5 + UPAGES]; prv_PMAP1 = &SMP_prvpt[6 + UPAGES]; #endif invltlb(); } void getmtrr() { int i; if (cpu == CPU_686) { for(i = 0; i < NPPROVMTRR; i++) { PPro_vmtrr[i].base = rdmsr(PPRO_VMTRRphysBase0 + i * 2); PPro_vmtrr[i].mask = rdmsr(PPRO_VMTRRphysMask0 + i * 2); } } } void putmtrr() { int i; if (cpu == CPU_686) { wbinvd(); for(i = 0; i < NPPROVMTRR; i++) { wrmsr(PPRO_VMTRRphysBase0 + i * 2, PPro_vmtrr[i].base); wrmsr(PPRO_VMTRRphysMask0 + i * 2, PPro_vmtrr[i].mask); } } } void pmap_setvidram(void) { #if 0 if (cpu == CPU_686) { wbinvd(); /* * Set memory between 0-640K to be WB */ wrmsr(0x250, 0x0606060606060606LL); wrmsr(0x258, 0x0606060606060606LL); /* * Set normal, PC video memory to be WC */ wrmsr(0x259, 0x0101010101010101LL); } #endif } void pmap_setdevram(unsigned long long basea, vm_offset_t sizea) { int i, free, skip; unsigned basepage, basepaget; unsigned long long base; unsigned long long mask; if (cpu != CPU_686) return; free = -1; skip = 0; basea &= ~0xfff; base = basea | 0x1; mask = (long long) (0xfffffffffLL - ((long) sizea - 1)) | (long long) 0x800; mask &= ~0x7ff; basepage = (long long) (base >> 12); for(i = 0; i < NPPROVMTRR; i++) { PPro_vmtrr[i].base = rdmsr(PPRO_VMTRRphysBase0 + i * 2); PPro_vmtrr[i].mask = rdmsr(PPRO_VMTRRphysMask0 + i * 2); basepaget = (long long) (PPro_vmtrr[i].base >> 12); if (basepage == basepaget) skip = 1; if ((PPro_vmtrr[i].mask & 0x800) == 0) { if (free == -1) free = i; } } if (!skip && free != -1) { wbinvd(); PPro_vmtrr[free].base = base; PPro_vmtrr[free].mask = mask; wrmsr(PPRO_VMTRRphysBase0 + free * 2, base); wrmsr(PPRO_VMTRRphysMask0 + free * 2, mask); printf( "pmap: added WC mapping at page: 0x%x %x, size: %u mask: 0x%x %x\n", (u_int)(base >> 32), (u_int)base, sizea, (u_int)(mask >> 32), (u_int)mask); } } /* * Set 4mb pdir for mp startup, and global flags */ void pmap_set_opt(unsigned *pdir) { int i; if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); if (pdir4mb) { pdir[KPTDI] = pdir4mb; } } if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); for(i = KPTDI; i < KPTDI + nkpt; i++) { if (pdir[i]) { pdir[i] |= PG_G; } } } } /* * Setup the PTD for the boot processor */ void pmap_set_opt_bsp(void) { pmap_set_opt((unsigned *)kernel_pmap->pm_pdir); pmap_set_opt((unsigned *)PTD); invltlb(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t s; int i; int initial_pvs; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); pv_npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. 
*/ s = (vm_size_t) (sizeof(pv_table_t) * pv_npg); s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_table_t *) addr; for(i = 0; i < pv_npg; i++) { vm_offset_t pa; TAILQ_INIT(&pv_table[i].pv_list); pv_table[i].pv_list_count = 0; pa = vm_first_phys + i * PAGE_SIZE; pv_table[i].pv_vm_page = PHYS_TO_VM_PAGE(pa); } /* * init the pv free list */ initial_pvs = pv_npg; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = &pvzone_store; pvinit = (struct pv_entry *) kmem_alloc(kernel_map, initial_pvs * sizeof (struct pv_entry)); zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit, pv_npg); /* * object for kernel page table pages */ kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { pv_entry_max = PMAP_SHPGPERPROC * maxproc + pv_npg; pv_entry_high_water = 9 * (pv_entry_max / 10); zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified( vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } static PMAP_INLINE void invltlb_1pg( vm_offset_t va) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va); } } static PMAP_INLINE void invltlb_2pg( vm_offset_t va1, vm_offset_t va2) { #if defined(I386_CPU) if (cpu_class == CPUCLASS_386) { invltlb(); } else #endif { invlpg(va1); invlpg(va2); } } static unsigned * get_ptbase(pmap) pmap_t pmap; { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == (((unsigned) PTDpde) & PG_FRAME)) { return (unsigned *) PTmap; } /* otherwise, we are alternate address space */ if (frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (frame | PG_RW | PG_V); #if defined(SMP) /* The page directory is not shared between CPUs */ cpu_invltlb(); #else invltlb(); #endif } return (unsigned *) APTmap; } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. 
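Concretely, pmap_pte_quick() below keeps one spare PTE (PMAP1/PADDR1, or the per-CPU prv_PMAP1/prv_PPAGE1 pair under SMP) pointed at whichever foreign page-table page it last needed, and only rewrites that PTE, followed by a single-page invlpg, when the cached frame does not match. A toy model of that one-entry cache; the names, the map_frame callback and the remap counter are invented for illustration:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t pt_entry;

/* One-entry cache of a mapped page-table page, like PMAP1/PADDR1. */
static uintptr_t cached_frame;         /* which PT page the window shows */
static pt_entry *window;               /* where it appears, like PADDR1 */
static unsigned  remaps;               /* how often an invlpg was paid for */

/*
 * Return a pointer to PTE index idx inside the page-table page whose
 * physical frame is frame, remapping the window only on a miss.
 * map_frame() stands in for rewriting *PMAP1 and invalidating PADDR1.
 */
static pt_entry *
pte_quick(uintptr_t frame, unsigned idx, pt_entry *(*map_frame)(uintptr_t))
{
    if (window == NULL || frame != cached_frame) {
        window = map_frame(frame);
        cached_frame = frame;
        remaps++;
    }
    return (&window[idx & 1023]);
}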
*/ static unsigned * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { unsigned pde, newpf; if (pde = (unsigned) pmap->pm_pdir[va >> PDRSHIFT]) { unsigned frame = (unsigned) pmap->pm_pdir[PTDPTDI] & PG_FRAME; unsigned index = i386_btop(va); /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == (((unsigned) PTDpde) & PG_FRAME))) { return (unsigned *) PTmap + index; } newpf = pde & PG_FRAME; #ifdef SMP if ( ((* (unsigned *) prv_PMAP1) & PG_FRAME) != newpf) { * (unsigned *) prv_PMAP1 = newpf | PG_RW | PG_V; cpu_invlpg(&prv_PPAGE1); } return prv_PPAGE1 + ((unsigned) index & (NPTEPG - 1)); #else if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; invltlb_1pg((vm_offset_t) PADDR1); } return PADDR1 + ((unsigned) index & (NPTEPG - 1)); #endif } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t rtval; vm_offset_t pdirindex; pdirindex = va >> PDRSHIFT; if (pmap && (rtval = (unsigned) pmap->pm_pdir[pdirindex])) { unsigned *pte; if ((rtval & PG_PS) != 0) { rtval &= ~(NBPDR - 1); rtval |= va & (NBPDR - 1); return rtval; } pte = get_ptbase(pmap) + i386_btop(va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /* * determine if a page is managed (memory vs. device) */ static PMAP_INLINE int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa < phys_avail[i + 1] && pa >= phys_avail[i]) return 1; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; unsigned npte = VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | pgeflag; unsigned opte; pte = (unsigned *)vtopte(tva); opte = *pte; *pte = npte; if (opte) invltlb_1pg(tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register unsigned *pte; for (i = 0; i < count; i++) { pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... 
*/ PMAP_INLINE void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register unsigned *pte; unsigned npte, opte; npte = pa | PG_RW | PG_V | pgeflag; pte = (unsigned *)vtopte(va); opte = *pte; *pte = npte; if (opte) invltlb_1pg(va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(va) vm_offset_t va; { register unsigned *pte; pte = (unsigned *)vtopte(va); *pte = 0; invltlb_1pg(va); } static vm_page_t pmap_page_lookup(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m && vm_page_sleep(m, "pplookp", NULL)) goto retry; return m; } /* * Create the UPAGES for a new process. * This routine directly affects the fork perf for a process. */ void pmap_new_proc(p) struct proc *p; { int i, updateneeded; vm_object_t upobj; vm_page_t m; struct user *up; unsigned *ptek, oldpte; /* * allocate object for the upages */ if ((upobj = p->p_upages_obj) == NULL) { upobj = vm_object_allocate( OBJT_DEFAULT, UPAGES); p->p_upages_obj = upobj; } /* get a kernel virtual address for the UPAGES for this proc */ if ((up = p->p_addr) == NULL) { up = (struct user *) kmem_alloc_pageable(kernel_map, UPAGES * PAGE_SIZE); #if !defined(MAX_PERF) if (up == NULL) panic("pmap_new_proc: u_map allocation failed"); #endif p->p_addr = up; } ptek = (unsigned *) vtopte((vm_offset_t) up); updateneeded = 0; for(i=0;iwire_count++; cnt.v_wire_count++; oldpte = *(ptek + i); /* * Enter the page into the kernel address space. */ *(ptek + i) = VM_PAGE_TO_PHYS(m) | PG_RW | PG_V | pgeflag; if (oldpte) { if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386)) { invlpg((vm_offset_t) up + i * PAGE_SIZE); } else { updateneeded = 1; } } vm_page_wakeup(m); m->flags &= ~PG_ZERO; m->flags |= PG_MAPPED | PG_WRITEABLE; m->valid = VM_PAGE_BITS_ALL; } if (updateneeded) invltlb(); } /* * Dispose the UPAGES for a process that has exited. * This routine directly impacts the exit perf of a process. */ void pmap_dispose_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; unsigned *ptek, oldpte; upobj = p->p_upages_obj; ptek = (unsigned *) vtopte((vm_offset_t) p->p_addr); for(i=0;iflags |= PG_BUSY; oldpte = *(ptek + i); *(ptek + i) = 0; if ((oldpte & PG_G) || (cpu_class > CPUCLASS_386)) invlpg((vm_offset_t) p->p_addr + i * PAGE_SIZE); - vm_page_unwire(m); + vm_page_unwire(m, 0); vm_page_free(m); } if (cpu_class <= CPUCLASS_386) invltlb(); } /* * Allow the UPAGES for a process to be prejudicially paged out. */ void pmap_swapout_proc(p) struct proc *p; { int i; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; /* * let the upages be paged */ for(i=0;idirty = VM_PAGE_BITS_ALL; - vm_page_unwire(m); - vm_page_deactivate(m); + vm_page_unwire(m, 0); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); } } /* * Bring the UPAGES for a specified process back in. */ void pmap_swapin_proc(p) struct proc *p; { int i,rv; vm_object_t upobj; vm_page_t m; upobj = p->p_upages_obj; for(i=0;ip_addr) + i * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(upobj, &m, 1, 0); #if !defined(MAX_PERF) if (rv != VM_PAGER_OK) panic("pmap_swapin_proc: cannot get upages for proc: %d\n", p->p_pid); #endif m = vm_page_lookup(upobj, i); m->valid = VM_PAGE_BITS_ALL; } vm_page_wire(m); vm_page_wakeup(m); m->flags |= PG_MAPPED | PG_WRITEABLE; } } /*************************************************** * Page table page management routines..... 
***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { int s; while (vm_page_sleep(m, "pmuwpt", NULL)); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pteva = UPT_MIN_ADDRESS + i386_ptob(m->pindex); invltlb_1pg(pteva); } if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { if (m->flags & PG_WANTED) { m->flags &= ~PG_WANTED; wakeup(m); } m->flags |= PG_BUSY; vm_page_free_zero(m); --cnt.v_wire_count; } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap, va, mpte) pmap_t pmap; vm_offset_t va; vm_page_t mpte; { unsigned ptepindex; if (va >= UPT_MIN_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } } return pmap_unwire_pte_hold(pmap, mpte); } #if !defined(SMP) void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); pmap_kenter((vm_offset_t) pmap->pm_pdir, (vm_offset_t) IdlePTD); pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } #else void pmap_pinit0(pmap) struct pmap *pmap; { pmap_pinit(pmap); } #endif /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
*/ if (pmap->pm_pdir == NULL) pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, PAGE_SIZE); /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate( OBJT_DEFAULT, PTDPTDI + 1); /* * allocate the page directory page */ retry: ptdpg = vm_page_grab( pmap->pm_pteobj, PTDPTDI, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); ptdpg->wire_count = 1; ++cnt.v_wire_count; ptdpg->flags &= ~(PG_MAPPED | PG_BUSY); /* not mapped normally */ ptdpg->valid = VM_PAGE_BITS_ALL; pmap_kenter((vm_offset_t) pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); if ((ptdpg->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir, PAGE_SIZE); /* wire in kernel global address entries */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(unsigned *) (pmap->pm_pdir + PTDPTDI) = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } static int pmap_release_free_page(pmap, p) struct pmap *pmap; vm_page_t p; { int s; unsigned *pde = (unsigned *) pmap->pm_pdir; /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ if (vm_page_sleep(p, "pmaprl", NULL)) return 0; p->flags |= PG_BUSY; /* * Remove the page table page from the processes address space. */ pde[p->pindex] = 0; pmap->pm_stats.resident_count--; #if !defined(MAX_PERF) if (p->hold_count) { panic("pmap_release: freeing held page table page"); } #endif /* * Page directory pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == PTDPTDI) { bzero(pde + KPTDI, nkpt * PTESIZE); #ifdef SMP pde[MPPTDI] = 0; #endif pde[APTDPTDI] = 0; pmap_kremove((vm_offset_t) pmap->pm_pdir); } if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) pmap->pm_ptphint = NULL; p->wire_count--; cnt.v_wire_count--; vm_page_free_zero(p); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_offset_t pteva, ptepa; vm_page_t m; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_ZERO | VM_ALLOC_RETRY); if (m->queue != PQ_NONE) { int s = splvm(); vm_page_unqueue(m); splx(s); } if (m->wire_count == 0) cnt.v_wire_count++; m->wire_count++; /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); /* * Set the page table hint */ pmap->pm_ptphint = m; /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. 
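 *
 * Editor's note, not part of the original source: the "new mapping"
 * is the recursive page table window at UPT_MIN_ADDRESS, which is
 * only usable when this pmap is the one currently loaded; that is why
 * the code below falls back to pmap_zero_page() on the physical
 * address otherwise.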
*/ if ((m->flags & PG_ZERO) == 0) { if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == (((unsigned) PTDpde) & PG_FRAME)) { pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(ptepa); } } m->valid = VM_PAGE_BITS_ALL; m->flags &= ~(PG_ZERO | PG_BUSY); m->flags |= PG_MAPPED; return m; } static vm_page_t pmap_allocpte(pmap, va) pmap_t pmap; vm_offset_t va; { unsigned ptepindex; vm_offset_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; invltlb(); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = m; } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_page_t p,n,ptdpg; vm_object_t object = pmap->pm_pteobj; int curgeneration; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif ptdpg = NULL; retry: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex == PTDPTDI) { ptdpg = p; continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } if (ptdpg && !pmap_release_free_page(pmap, ptdpg)) goto retry; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; vm_offset_t ptppaddr; vm_page_t nkpg; #ifdef SMP int i; #endif pd_entry_t newpdir; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM); #if !defined(MAX_PERF) if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); #endif nkpt++; vm_page_wire(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); pmap_zero_page(ptppaddr); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; #ifdef SMP for (i = 0; i < mp_ncpus; i++) { if (IdlePTDS[i]) pdir_pde(IdlePTDS[i], kernel_vm_end) = newpdir; } #endif for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = newpdir; } } *pmap_pde(kernel_pmap, 
kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); #if !defined(MAX_PERF) panic("destroying a pmap is not yet implemented"); #endif } } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv) pv_entry_t pv; { pv_entry_count--; zfreei(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return zalloci(pvzone); } /* * This routine is very drastic, but can save the system * in a pinch. */ void pmap_collect() { pv_table_t *ppv; int i; vm_offset_t pa; vm_page_t m; static int warningdone=0; if (pmap_pagedaemon_waken == 0) return; if (warningdone < 5) { printf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); warningdone++; } for(i = 0; i < pv_npg; i++) { if ((ppv = &pv_table[i]) == 0) continue; m = ppv->pv_vm_page; if ((pa = VM_PAGE_TO_PHYS(m)) == 0) continue; if (m->wire_count || m->hold_count || m->busy || (m->flags & PG_BUSY)) continue; pmap_remove_all(pa); } pmap_pagedaemon_waken = 0; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap, ppv, va) struct pmap *pmap; pv_table_t *ppv; vm_offset_t va; { pv_entry_t pv; int rtval; int s; s = splvm(); if (ppv->pv_list_count < pmap->pm_stats.resident_count) { for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = TAILQ_NEXT(pv, pv_plist)) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; if (TAILQ_FIRST(&ppv->pv_list) == NULL) ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). 
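 *
 * Editor's note, not part of the original source: each pv_entry is
 * threaded onto two lists at once -- the per-pmap pm_pvlist and the
 * per-physical-page pv_list -- so a mapping can be torn down either
 * from the pmap side (pmap_remove_pages) or from the page side
 * (pmap_remove_all).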
*/ static void pmap_insert_entry(pmap, va, mpte, pa) pmap_t pmap; vm_offset_t va; vm_page_t mpte; vm_offset_t pa; { int s; pv_entry_t pv; pv_table_t *ppv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); ppv = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); ppv->pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap, ptq, va) struct pmap *pmap; unsigned *ptq; vm_offset_t va; { unsigned oldpte; pv_table_t *ppv; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) invlpg(va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { ppv = pa_to_pvh(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (oldpte & PG_A) ppv->pv_vm_page->flags |= PG_REFERENCED; return pmap_remove_entry(pmap, ppv, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap, va) struct pmap *pmap; register vm_offset_t va; { register unsigned *ptq; /* * if there is no pte for this address, just skip it!!! */ if (*pmap_pde(pmap, va) == 0) { return; } /* * get a local va for mappings for this pmap. */ ptq = get_ptbase(pmap) + i386_btop(va); if (*ptq) { (void) pmap_remove_pte(pmap, ptq, va); invltlb_1pg(va); } return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register unsigned *ptbase; vm_offset_t pdnxt; vm_offset_t ptpaddr; vm_offset_t sindex, eindex; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (((sva + PAGE_SIZE) == eva) && (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; /* * Get a local virtual address for the mappings that are being * worked with. */ ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); if (pmap->pm_stats.resident_count == 0) break; pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (pdnxt > eindex) { pdnxt = eindex; } for ( ;sindex != pdnxt; sindex++) { vm_offset_t va; if (ptbase[sindex] == 0) { continue; } va = i386_ptob(sindex); anyvalid++; if (pmap_remove_pte(pmap, ptbase + sindex, va)) break; } } if (anyvalid) { invltlb(); } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte, tpte; int nmodify; int update_needed; int s; nmodify = 0; update_needed = 0; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", pa); } #endif s = splvm(); ppv = pa_to_pvh(pa); while ((pv = TAILQ_FIRST(&ppv->pv_list)) != NULL) { pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) ppv->pv_vm_page->flags |= PG_REFERENCED; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } if (!update_needed && ((!curproc || (&curproc->p_vmspace->vm_pmap == pv->pv_pmap)) || (pv->pv_pmap == kernel_pmap))) { update_needed = 1; } TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); ppv->pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); if (update_needed) invltlb(); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { register unsigned *ptbase; vm_offset_t pdnxt, ptpaddr; vm_pindex_t sindex, eindex; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; ptbase = get_ptbase(pmap); sindex = i386_btop(sva); eindex = i386_btop(eva); for (; sindex < eindex; sindex = pdnxt) { unsigned pdirindex; pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1)); pdirindex = sindex / NPDEPG; if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) { (unsigned) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged++; continue; } /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. 
*/ if (ptpaddr == 0) continue; if (pdnxt > eindex) { pdnxt = eindex; } for (; sindex != pdnxt; sindex++) { unsigned pbits; pv_table_t *ppv; pbits = ptbase[sindex]; if (pbits & PG_MANAGED) { ppv = NULL; if (pbits & PG_A) { ppv = pa_to_pvh(pbits); ppv->pv_vm_page->flags |= PG_REFERENCED; pbits &= ~PG_A; } if (pbits & PG_M) { if (pmap_track_modified(i386_ptob(sindex))) { if (ppv == NULL) ppv = pa_to_pvh(pbits); ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; pbits &= ~PG_M; } } } pbits &= ~PG_RW; if (pbits != ptbase[sindex]) { ptbase[sindex] = pbits; anychanged = 1; } } } if (anychanged) invltlb(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, vm_prot_t prot, boolean_t wired) { register unsigned *pte; vm_offset_t opa; vm_offset_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { vm_offset_t *pdeaddr = (vm_offset_t *)pmap_pde(pmap, va); if (((origpte = (vm_offset_t) *pdeaddr) & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page(0), pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } if (smp_active) { pdeaddr = (vm_offset_t *) IdlePTDS[cpuid]; if (((newpte = pdeaddr[va >> PDRSHIFT]) & PG_V) == 0) { if ((vm_offset_t) my_idlePTD != (vm_offset_t) vtophys(pdeaddr)) printf("pde mismatch: %x, %x\n", my_idlePTD, pdeaddr); printf("cpuid: %d, pdeaddr: 0x%x\n", cpuid, pdeaddr); panic("pmap_enter: invalid kernel page table page(1), pdir=%p, npde=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], newpte, origpte, va); } } } #endif pte = pmap_pte(pmap, va); #if !defined(MAX_PERF) /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory, pdir=%p, va=0x%x\n", (void *)pmap->pm_pdir[PTDPTDI], va); } #endif origpte = *(vm_offset_t *)pte; pa &= PG_FRAME; opa = origpte & PG_FRAME; #if !defined(MAX_PERF) if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); #endif /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. 
*/ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { *pte |= PG_RW; invltlb_1pg(va); } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { pv_table_t *ppv; ppv = pa_to_pvh(opa); ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; err = pmap_remove_pte(pmap, pte, va); #if !defined(MAX_PERF) if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); #endif } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { pmap_insert_entry(pmap, va, mpte, pa); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < UPT_MIN_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { *pte = newpte | PG_A; if (origpte) invltlb_1pg(va); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap, va, pa, mpte) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_page_t mpte; { register unsigned *pte; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < UPT_MIN_ADDRESS) { unsigned ptepindex; vm_offset_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { #if !defined(MAX_PERF) if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); #endif if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); pmap->pm_ptphint = mpte; } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. 
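 *
 * Editor's note, not part of the original source: vtopte() is assumed
 * to resolve through PTmap, the recursive page table self map, which
 * is exactly why it is only valid for the currently loaded pmap;
 * pmap_pte_quick() handles the general case by temporarily mapping
 * the other pmap's page table through PMAP1/PADDR1.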
*/ pte = (unsigned *)vtopte(va); if (*pte) { if (mpte) pmap_unwire_pte_hold(pmap, mpte); return 0; } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pmap_insert_entry(pmap, va, mpte, pa); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pa | PG_V | PG_U | PG_MANAGED; return mpte; } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, pindex, size, limit) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; int limit; { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; int objpgs; if (!pmap) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0) ) { int i; int s; vm_page_t m[1]; unsigned int ptepindex; int npdes; vm_offset_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; retry: p = vm_page_lookup(object, pindex); if (p && vm_page_sleep(p, "init4p", NULL)) goto retry; if (p == NULL) { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_free(p); return; } p = vm_page_lookup(object, pindex); vm_page_wakeup(p); } ptepa = (vm_offset_t) VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i=0;ipm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } p->flags |= PG_MAPPED; invltlb(); return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || (limit && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) psize = object->size - pindex; mpte = NULL; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = TAILQ_FIRST(&object->memq); ((objpgs > 0) && (p != NULL)); p = TAILQ_NEXT(p, listq)) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; vm_page_wakeup(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); if (p && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); p->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), VM_PAGE_TO_PHYS(p), mpte); p->flags |= PG_MAPPED; vm_page_wakeup(p); } } } return; } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. 
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -PAGE_SIZE, PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; unsigned *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == NULL) continue; pte = (unsigned *) vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } m->flags |= PG_BUSY; mpte = pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m), mpte); m->flags |= PG_MAPPED; vm_page_wakeup(m); } } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register unsigned *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; unsigned src_frame, dst_frame; if (dst_addr != src_addr) return; src_frame = ((unsigned) src_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (src_frame != (((unsigned) PTDpde) & PG_FRAME)) { return; } dst_frame = ((unsigned) dst_pmap->pm_pdir[PTDPTDI]) & PG_FRAME; if (dst_frame != (((unsigned) APTDpde) & PG_FRAME)) { APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); invltlb(); } for(addr = src_addr; addr < end_addr; addr = pdnxt) { unsigned *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; vm_offset_t srcptepaddr; unsigned ptepindex; #if !defined(MAX_PERF) if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); #endif /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. */ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); ptepindex = addr >> PDRSHIFT; srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = (unsigned *) vtopte(addr); dst_pte = (unsigned *) avtopte(addr); while (addr < pdnxt) { unsigned ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, (ptetemp & PG_FRAME)); } else { pmap_unwire_pte_hold(dst_pmap, dstmpte); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; dst_pte++; } } } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. 
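 *
 * Editor's sketch, not part of the original source: a typical call
 * zeroes a freshly allocated vm_page by its physical address, e.g.
 *
 *	pmap_zero_page(VM_PAGE_TO_PHYS(m));
 *
 * as _pmap_allocpte() does above when the page was not pre-zeroed.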
*/ void pmap_zero_page(phys) vm_offset_t phys; { #ifdef SMP #if !defined(MAX_PERF) if (*(int *) prv_CMAP3) panic("pmap_zero_page: prv_CMAP3 busy"); #endif *(int *) prv_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; cpu_invlpg(&prv_CPAGE3); #if defined(I686_CPU) if (cpu == CPU_686) i686_pagezero(&prv_CPAGE3); else #endif bzero(&prv_CPAGE3, PAGE_SIZE); *(int *) prv_CMAP3 = 0; #else #if !defined(MAX_PERF) if (*(int *) CMAP2) panic("pmap_zero_page: CMAP2 busy"); #endif *(int *) CMAP2 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M; if (cpu_class == CPUCLASS_386) { invltlb(); } else { invlpg((u_int)CADDR2); } #if defined(I686_CPU) if (cpu == CPU_686) i686_pagezero(CADDR2); else #endif bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; #endif } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { #ifdef SMP #if !defined(MAX_PERF) if (*(int *) prv_CMAP1) panic("pmap_copy_page: prv_CMAP1 busy"); if (*(int *) prv_CMAP2) panic("pmap_copy_page: prv_CMAP2 busy"); #endif *(int *) prv_CMAP1 = PG_V | (src & PG_FRAME) | PG_A; *(int *) prv_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M; cpu_invlpg(&prv_CPAGE1); cpu_invlpg(&prv_CPAGE2); bcopy(&prv_CPAGE1, &prv_CPAGE2, PAGE_SIZE); *(int *) prv_CMAP1 = 0; *(int *) prv_CMAP2 = 0; #else #if !defined(MAX_PERF) if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); #endif *(int *) CMAP1 = PG_V | (src & PG_FRAME) | PG_A; *(int *) CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M; if (cpu_class == CPUCLASS_386) { invltlb(); } else { invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); } bcopy(CADDR1, CADDR2, PAGE_SIZE); *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; #endif } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; pv_table_t *ppv; int s; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
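 *
 * Editor's note, not part of the original source: one consumer is the
 * exec path -- exec_new_vmspace() below calls
 *
 *	pmap_remove_pages(&vmspace->vm_pmap, 0, USRSTACK);
 *
 * to drop every user mapping before tearing down the old vm_map.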
*/ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { unsigned *pte, tpte; pv_table_t *ppv; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif s = splvm(); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = (unsigned *)vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; ppv = pa_to_pvh(tpte); pv->pv_pmap->pm_stats.resident_count--; /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); ppv->pv_list_count--; TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { ppv->pv_vm_page->flags &= ~(PG_MAPPED | PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); invltlb(); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; if (!pmap_is_managed(pa)) return FALSE; ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) return FALSE; s = splvm(); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (bit & (PG_A|PG_M)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & bit) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; pv_table_t *ppv; register unsigned *pte; int changed; int s; if (!pmap_is_managed(pa)) return; s = splvm(); changed = 0; ppv = pa_to_pvh(pa); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *(int *)pte |= bit; changed = 1; } else { vm_offset_t pbits = *(vm_offset_t *)pte; if (pbits & bit) { changed = 1; if (bit == PG_RW) { if (pbits & PG_M) { ppv->pv_vm_page->dirty = VM_PAGE_BITS_ALL; } *(int *)pte = pbits & ~(PG_M|PG_RW); } else { *(int *)pte = pbits & ~bit; } } } } splx(s); if (changed) invltlb(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. 
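 *
 * Editor's note, not part of the original source: as the body below
 * shows, removing VM_PROT_WRITE write-protects every mapping via
 * pmap_changebit(phys, PG_RW, FALSE), while removing read access as
 * well drops the mappings outright with pmap_remove_all(phys).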
*/ void pmap_page_protect(vm_offset_t phys, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(phys, PG_RW, FALSE); } else { pmap_remove_all(phys); } } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. * */ int pmap_ts_referenced(vm_offset_t pa) { register pv_entry_t pv; pv_table_t *ppv; unsigned *pte; int s; int rtval = 0; if (!pmap_is_managed(pa)) return FALSE; s = splvm(); ppv = pa_to_pvh(pa); if (TAILQ_FIRST(&ppv->pv_list) == NULL) { splx(s); return 0; } /* * Not found, check current mappings returning immediately if found. */ for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { TAILQ_REMOVE(&ppv->pv_list, pv, pv_list); /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (!pmap_track_modified(pv->pv_va)) { TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); continue; } pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte == NULL) { TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); continue; } if (*pte & PG_A) { rtval++; *pte &= ~PG_A; if (rtval > 4) { TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); break; } } TAILQ_INSERT_TAIL(&ppv->pv_list, pv, pv_list); } splx(s); if (rtval) { invltlb(); } return (rtval); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
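 *
 * Editor's sketch, not part of the original source: a hypothetical
 * driver would map its register window with something like
 *
 *	void *regs = pmap_mapdev(pa, 0x1000);
 *
 * where "pa" is the device's physical base address; the size is
 * rounded up to a whole number of pages internally.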
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; unsigned *pte; size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); #if !defined(MAX_PERF) if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); #endif pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = (unsigned *)vtopte(tmpva); *pte = pa | PG_RW | PG_V | pgeflag; size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } invltlb(); return ((void *) va); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { unsigned *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if (pte = *ptep) { pv_table_t *ppv; vm_offset_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; ppv = pa_to_pvh((pa & PG_FRAME)); m = ppv->pv_vm_page; /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; /* * Modified by someone */ else if (m->dirty || pmap_is_modified(pa)) val |= MINCORE_MODIFIED_OTHER; /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; /* * Referenced by someone */ else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(pa)) { val |= MINCORE_REFERENCED_OTHER; m->flags |= PG_REFERENCED; } } return val; } void pmap_activate(struct proc *p) { #if defined(SWTCH_OPTIM_STATS) tlb_flush_count++; #endif load_cr3(p->p_addr->u_pcb.pcb_cr3 = vtophys(p->p_vmspace->vm_pmap.pm_pdir)); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; for (p = allproc.lh_first; p != NULL; p = p->p_list.le_next) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = &p->p_vmspace->vm_pmap; for(i=0;i<1024;i++) { pd_entry_t *pde; unsigned *pte; unsigned base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for(j=0;j<1024;j++) { unsigned va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } return npte; } pte = pmap_pte_quick( pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } return npte; } #endif #if defined(DEBUG) static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; unsigned *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { pv_table_t *ppv; register pv_entry_t pv; printf("pa %x", pa); ppv = pa_to_pvh(pa); for (pv = TAILQ_FIRST(&ppv->pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { #ifdef used_to_be printf(" -> pmap %p, va %x, flags %x", (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif 
printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 40699) +++ head/sys/kern/kern_exec.c (revision 40700) @@ -1,758 +1,758 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: kern_exec.c,v 1.86 1998/09/04 08:06:55 dfr Exp $ + * $Id: kern_exec.c,v 1.87 1998/10/16 03:55:00 peter Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static long *exec_copyout_strings __P((struct image_params *)); static struct ps_strings *ps_strings = PS_STRINGS; SYSCTL_INTPTR(_kern, KERN_PS_STRINGS, ps_strings, 0, &ps_strings, 0, ""); static caddr_t usrstack = (caddr_t)USRSTACK; SYSCTL_INTPTR(_kern, KERN_USRSTACK, usrstack, 0, &usrstack, 0, ""); /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * execve() system call. */ int execve(p, uap) struct proc *p; register struct execve_args *uap; { struct nameidata nd, *ndp; long *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; imgp = &image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->uap = uap; imgp->attr = &attr; imgp->argc = imgp->envc = 0; imgp->argv0 = NULL; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; imgp->auxargs = NULL; imgp->vp = NULL; imgp->firstpage = NULL; /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); if (imgp->stringbase == NULL) { error = ENOMEM; goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; imgp->image_header = imgp->stringbase + ARG_MAX; /* * Translate the file name. 
namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, uap->fname, p); interpret: error = namei(ndp); if (error) { kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); goto exec_fail; } imgp->vp = ndp->ni_vp; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) { VOP_UNLOCK(imgp->vp, 0, p); goto exec_fail_dealloc; } error = exec_map_first_page(imgp); VOP_UNLOCK(imgp->vp, 0, p); if (error) goto exec_fail_dealloc; /* * Loop through list of image activators, calling each one. * If there is no match, the activator returns -1. If there * is a match, but there was an error during the activation, * the error is returned. Otherwise 0 means success. If the * image is interpreted, loop back up and try activating * the interpreter. */ for (i = 0; execsw[i]; ++i) { if (execsw[i]->ex_imgact) error = (*execsw[i]->ex_imgact)(imgp); else continue; if (error == -1) continue; if (error) goto exec_fail_dealloc; if (imgp->interpreted) { exec_unmap_first_page(imgp); /* free old vnode and name buffer */ vrele(ndp->ni_vp); zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, p); goto interpret; } break; } /* If we made it through all the activators and none matched, exit. */ if (error == -1) { error = ENOEXEC; goto exec_fail_dealloc; } /* * Copy out strings (args and env) and initialize stack base */ stack_base = exec_copyout_strings(imgp); p->p_vmspace->vm_minsaddr = (char *)stack_base; /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; tmp = fdcopy(p); fdfree(p); p->p_fd = tmp; } /* close files on exec */ fdcloseexec(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup((caddr_t)p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. */ if ((attr.va_mode & VSUID && p->p_ucred->cr_uid != attr.va_uid || attr.va_mode & VSGID && p->p_ucred->cr_gid != attr.va_gid) && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. */ if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) { p->p_traceflag = 0; vrele(p->p_tracep); p->p_tracep = NULL; } /* * Set the new credentials. */ p->p_ucred = crcopy(p->p_ucred); if (attr.va_mode & VSUID) p->p_ucred->cr_uid = attr.va_uid; if (attr.va_mode & VSGID) p->p_ucred->cr_gid = attr.va_gid; setsugid(p); } else { if (p->p_ucred->cr_uid == p->p_cred->p_ruid && p->p_ucred->cr_gid == p->p_cred->p_rgid) p->p_flag &= ~P_SUGID; } /* * Implement correct POSIX saved-id behavior. 
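 *
 * Editor's note, not part of the original source: POSIX requires the
 * saved set-user-ID and set-group-ID to be set to the new effective
 * IDs at exec time, which is what the two assignments below do; a
 * set-id program can then drop and later regain those IDs with
 * seteuid()/setegid().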
*/ p->p_cred->p_svuid = p->p_ucred->cr_uid; p->p_cred->p_svgid = p->p_ucred->cr_gid; /* * Store the vp for use in procfs */ if (p->p_textvp) /* release old reference */ vrele(p->p_textvp); VREF(ndp->ni_vp); p->p_textvp = ndp->ni_vp; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. */ STOPEVENT(p, S_EXEC, 0); if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Set entry address */ setregs(p, imgp->entry_addr, (u_long)(uintptr_t)stack_base); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage) exec_unmap_first_page(imgp); if (imgp->stringbase != NULL) kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); if (ndp->ni_vp) { vrele(ndp->ni_vp); zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); } if (error == 0) return (0); exec_fail: if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(p, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ return(0); } else { return(error); } } int exec_map_first_page(imgp) struct image_params *imgp; { int s, rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage) { exec_unmap_first_page(imgp); } object = imgp->vp->v_object; s = splvm(); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if (ma[i] = vm_page_lookup(object, i)) { if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) break; if (ma[i]->valid) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_protect(ma[0], VM_PROT_NONE); vm_page_free(ma[0]); } splx(s); return EIO; } } vm_page_wire(ma[0]); vm_page_wakeup(ma[0]); splx(s); pmap_kenter((vm_offset_t) imgp->image_header, VM_PAGE_TO_PHYS(ma[0])); imgp->firstpage = ma[0]; return 0; } void exec_unmap_first_page(imgp) struct image_params *imgp; { if (imgp->firstpage) { pmap_kremove((vm_offset_t) imgp->image_header); - vm_page_unwire(imgp->firstpage); + vm_page_unwire(imgp->firstpage, 1); imgp->firstpage = NULL; } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. 
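 *
 * Editor's note, not part of the original source: the code below maps
 * only the top SGROWSIZ bytes ending at USRSTACK and records
 * vm_maxsaddr = USRSTACK - MAXSSIZ, so the stack can later grow
 * downward on demand up to a total of MAXSSIZ.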
*/ int exec_new_vmspace(imgp) struct image_params *imgp; { int error; struct vmspace *vmspace = imgp->proc->p_vmspace; caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); vm_map_t map = &vmspace->vm_map; imgp->vmspace_destroyed = 1; /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ if (vmspace->vm_refcnt == 1) { if (vmspace->vm_shm) shmexit(imgp->proc); pmap_remove_pages(&vmspace->vm_pmap, 0, USRSTACK); vm_map_remove(map, 0, USRSTACK); } else { vmspace_exec(imgp->proc); vmspace = imgp->proc->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ error = vm_map_insert(&vmspace->vm_map, NULL, 0, (vm_offset_t) stack_addr, (vm_offset_t) USRSTACK, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return (error); vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; /* Initialize maximum stack address */ vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; return(0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_extract_strings(imgp) struct image_params *imgp; { char **argv, **envv; char *argp, *envp; int error; size_t length; /* * extract arguments first */ argv = imgp->uap->argv; if (argv) { argp = (caddr_t) (intptr_t) fuword(argv); if (argp == (caddr_t) -1) return (EFAULT); if (argp) argv++; if (imgp->argv0) argp = imgp->argv0; if (argp) { do { if (argp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(argp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->argc++; } while ((argp = (caddr_t) (intptr_t) fuword(argv++))); } } /* * extract environment strings */ envv = imgp->uap->envv; if (envv) { while ((envp = (caddr_t) (intptr_t) fuword(envv++))) { if (envp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(envp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->envc++; } } return (0); } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. */ long * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; long *stack_base; struct ps_strings *arginfo; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ arginfo = PS_STRINGS; szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(imgp->proc->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) /* * The '+ 2' is for the null pointers at the end of each of the * arg and env vector sets, and 'AT_COUNT*2' is room for the * ELF Auxargs data. 
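 *
 * Editor's example, not part of the original source, using assumed
 * numbers: with argc = 2 and envc = 3 and ELF auxargs present, vectp
 * ends up (2 + 3 + 2 + AT_COUNT*2) pointers below destp -- two argv
 * slots, three envp slots, one NULL terminator for each vector, plus
 * the auxargs area.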
*/ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + AT_COUNT*2) * sizeof(char*)); else /* * The '+ 2' is for the null pointers at the end of each of the * arg and env vector sets */ vectp = (char **) (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*)); /* * vectp also becomes our initial stack base */ stack_base = (long *)vectp; stringp = imgp->stringbase; argc = imgp->argc; envc = imgp->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer seperates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct proc *p = imgp->proc; struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; int error; /* Get file attributes */ error = VOP_GETATTR(vp, attr, p->p_ucred, p); if (error) return (error); /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) { return (EACCES); } /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). 
*/ error = VOP_OPEN(vp, FREAD, p->p_ucred, p); if (error) return (error); return (0); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return ENOMEM; xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return 0; } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return ENOENT; for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return ENOMEM; xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return 0; } Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 40699) +++ head/sys/kern/sys_pipe.c (revision 40700) @@ -1,1100 +1,1100 @@ /* * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: sys_pipe.c,v 1.42 1998/06/07 17:11:39 dfr Exp $ + * $Id: sys_pipe.c,v 1.43 1998/10/13 08:24:40 dg Exp $ */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and * the receiving process can copy it directly from the pages in the sending * process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. 
* * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. PIPE_SIZE is constrained by the * amount of kernel virtual memory. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ /* * interfaces to the outside world */ static int pipe_read __P((struct file *fp, struct uio *uio, struct ucred *cred)); static int pipe_write __P((struct file *fp, struct uio *uio, struct ucred *cred)); static int pipe_close __P((struct file *fp, struct proc *p)); static int pipe_poll __P((struct file *fp, int events, struct ucred *cred, struct proc *p)); static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p)); static struct fileops pipeops = { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_close }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) /* * Maximum amount of kva for pipes -- this is kind-of a soft limit, but * is there so that on large systems, we don't exhaust it. */ #define MAXPIPEKVA (8*1024*1024) /* * Limit for direct transfers, we cannot, of course limit * the amount of kva for pipes in general though. 
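 */

/*
 * A minimal sketch, not part of this change, of the mode selection described
 * in the comment at the top of this file: pipe_write() below only takes the
 * direct (page-wiring) path for large, blocking writes and falls back to the
 * buffered path otherwise.  The helper name and parameters are invented for
 * illustration; the authoritative test is the one in pipe_write().
 */
static int
example_use_direct_write(size_t len, int nonblocking, int have_kva,
    int pipekva_in_use, int pipekva_limit)
{
	if (len < PIPE_MINDIRECT)	/* small writes stay buffered */
		return (0);
	if (nonblocking)		/* the direct path may sleep */
		return (0);
	if (!have_kva && pipekva_in_use >= pipekva_limit)
		return (0);		/* respect the pipe kva soft limit */
	return (1);
}

/*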
*/ #define LIMITPIPEKVA (16*1024*1024) /* * Limit the number of "big" pipes */ #define LIMITBIGPIPES 32 static int nbigpipe; static int amountpipekva; static void pipeclose __P((struct pipe *cpipe)); static void pipeinit __P((struct pipe *cpipe)); static __inline int pipelock __P((struct pipe *cpipe, int catch)); static __inline void pipeunlock __P((struct pipe *cpipe)); static __inline void pipeselwakeup __P((struct pipe *cpipe)); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); static void pipe_destroy_write_buffer __P((struct pipe *wpipe)); static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); static void pipe_clone_write_buffer __P((struct pipe *wpipe)); #endif static void pipespace __P((struct pipe *cpipe)); static vm_zone_t pipe_zone; /* * The pipe system call for the DTYPE_PIPE type of pipes */ /* ARGSUSED */ int pipe(p, uap) struct proc *p; struct pipe_args /* { int dummy; } */ *uap; { register struct filedesc *fdp = p->p_fd; struct file *rf, *wf; struct pipe *rpipe, *wpipe; int fd, error; if (pipe_zone == NULL) pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4); rpipe = zalloc( pipe_zone); pipeinit(rpipe); rpipe->pipe_state |= PIPE_DIRECTOK; wpipe = zalloc( pipe_zone); pipeinit(wpipe); wpipe->pipe_state |= PIPE_DIRECTOK; error = falloc(p, &rf, &fd); if (error) goto free2; p->p_retval[0] = fd; rf->f_flag = FREAD | FWRITE; rf->f_type = DTYPE_PIPE; rf->f_ops = &pipeops; rf->f_data = (caddr_t)rpipe; error = falloc(p, &wf, &fd); if (error) goto free3; wf->f_flag = FREAD | FWRITE; wf->f_type = DTYPE_PIPE; wf->f_ops = &pipeops; wf->f_data = (caddr_t)wpipe; p->p_retval[1] = fd; rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; return (0); free3: ffree(rf); fdp->fd_ofiles[p->p_retval[0]] = 0; free2: (void)pipeclose(wpipe); (void)pipeclose(rpipe); return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable */ static void pipespace(cpipe) struct pipe *cpipe; { int npages, error; npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE; /* * Create an object, I don't like the idea of paging to/from * kernel_object. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. */ cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages); cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map); /* * Insert the object into the kernel map, and allocate kva for it. * The map entry is, by default, pageable. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 
*/ error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0, (vm_offset_t *) &cpipe->pipe_buffer.buffer, cpipe->pipe_buffer.size, 1, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error); amountpipekva += cpipe->pipe_buffer.size; } /* * initialize and allocate VM and memory for pipe */ static void pipeinit(cpipe) struct pipe *cpipe; { cpipe->pipe_buffer.in = 0; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = 0; cpipe->pipe_buffer.size = PIPE_SIZE; /* Buffer kva gets dynamically allocated */ cpipe->pipe_buffer.buffer = NULL; /* cpipe->pipe_buffer.object = invalid */ cpipe->pipe_state = 0; cpipe->pipe_peer = NULL; cpipe->pipe_busy = 0; getnanotime(&cpipe->pipe_ctime); cpipe->pipe_atime = cpipe->pipe_ctime; cpipe->pipe_mtime = cpipe->pipe_ctime; bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel); cpipe->pipe_pgid = NO_PID; #ifndef PIPE_NODIRECT /* * pipe data structure initializations to support direct pipe I/O */ cpipe->pipe_map.cnt = 0; cpipe->pipe_map.kva = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; /* cpipe->pipe_map.ms[] = invalid */ #endif } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; while (cpipe->pipe_state & PIPE_LOCK) { cpipe->pipe_state |= PIPE_LWANT; if (error = tsleep( cpipe, catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) { return error; } } cpipe->pipe_state |= PIPE_LOCK; return 0; } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { cpipe->pipe_state &= ~PIPE_LOCK; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } static __inline void pipeselwakeup(cpipe) struct pipe *cpipe; { struct proc *p; if (cpipe->pipe_state & PIPE_SEL) { cpipe->pipe_state &= ~PIPE_SEL; selwakeup(&cpipe->pipe_sel); } if (cpipe->pipe_state & PIPE_ASYNC) { if (cpipe->pipe_pgid < 0) gsignal(-cpipe->pipe_pgid, SIGIO); else if ((p = pfind(cpipe->pipe_pgid)) != NULL) psignal(p, SIGIO); } } /* ARGSUSED */ static int pipe_read(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { struct pipe *rpipe = (struct pipe *) fp->f_data; int error = 0; int nread = 0; u_int size; ++rpipe->pipe_busy; while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; if ((error = pipelock(rpipe,1)) == 0) { error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); pipeunlock(rpipe); } if (error) { break; } rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { caddr_t va; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; if ((error = pipelock(rpipe,1)) == 0) { va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; error = uiomove(va, size, uio); pipeunlock(rpipe); } if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~PIPE_DIRECTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition */ if (rpipe->pipe_state & PIPE_EOF) { /* XXX error = ? 
				 */
				break;
			}

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}
			if (nread > 0)
				break;

			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if ((error = pipelock(rpipe,1)) == 0) {
				if (rpipe->pipe_buffer.cnt == 0) {
					rpipe->pipe_buffer.in = 0;
					rpipe->pipe_buffer.out = 0;
				}
				pipeunlock(rpipe);
			} else {
				break;
			}

			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			rpipe->pipe_state |= PIPE_WANTR;
			if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) {
				break;
			}
		}
	}

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);

	--rpipe->pipe_busy;
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * If there is no more to read in the pipe, reset
		 * its pointers to the beginning.  This improves
		 * cache hit stats.
		 */
		if (rpipe->pipe_buffer.cnt == 0) {
			if ((error == 0) && (error = pipelock(rpipe,1)) == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
				pipeunlock(rpipe);
			}
		}

		/*
		 * If the "write-side" has been blocked, wake it up now.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return error;
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	for(i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
		addr < endaddr;
		addr += PAGE_SIZE, i+=1) {

		vm_page_t m;

		vm_fault_quick( (caddr_t) addr, VM_PROT_READ);
		paddr = pmap_kextract(addr);
		if (!paddr) {
			int j;
			for(j=0;j<i;j++)
-				vm_page_unwire(wpipe->pipe_map.ms[j]);
+				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return EFAULT;
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return 0;
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i=0;i<wpipe->pipe_map.npages;i++)
-		vm_page_unwire(wpipe->pipe_map.ms[i]);
+		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva+pos,
	    (caddr_t) wpipe->pipe_buffer.buffer,
	    size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if ( wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if ( wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe,0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
*/ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return error; error1: wakeup(wpipe); return error; } #endif static int pipe_write(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { int error = 0; int orig_resid; struct pipe *wpipe, *rpipe; rpipe = (struct pipe *) fp->f_data; wpipe = rpipe->pipe_peer; /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { return EPIPE; } /* * If it is advantageous to resize the pipe buffer, do * so. */ if ((uio->uio_resid > PIPE_SIZE) && (nbigpipe < LIMITBIGPIPES) && (wpipe->pipe_state & PIPE_DIRECTW) == 0 && (wpipe->pipe_buffer.size <= PIPE_SIZE) && (wpipe->pipe_buffer.cnt == 0)) { if (wpipe->pipe_buffer.buffer) { amountpipekva -= wpipe->pipe_buffer.size; kmem_free(kernel_map, (vm_offset_t)wpipe->pipe_buffer.buffer, wpipe->pipe_buffer.size); } #ifndef PIPE_NODIRECT if (wpipe->pipe_map.kva) { amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; kmem_free(kernel_map, wpipe->pipe_map.kva, wpipe->pipe_buffer.size + PAGE_SIZE); } #endif wpipe->pipe_buffer.in = 0; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = 0; wpipe->pipe_buffer.size = BIG_PIPE_SIZE; wpipe->pipe_buffer.buffer = NULL; ++nbigpipe; #ifndef PIPE_NODIRECT wpipe->pipe_map.cnt = 0; wpipe->pipe_map.kva = 0; wpipe->pipe_map.pos = 0; wpipe->pipe_map.npages = 0; #endif } if( wpipe->pipe_buffer.buffer == NULL) { if ((error = pipelock(wpipe,1)) == 0) { pipespace(wpipe); pipeunlock(wpipe); } else { return error; } } ++wpipe->pipe_busy; orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. */ if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && (fp->f_flag & FNONBLOCK) == 0 && (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { error = pipe_direct_write( wpipe, uio); if (error) { break; } continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. */ retrywrite: while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } error = tsleep(wpipe, PRIBIO|PCATCH, "pipbww", 0); if (error) break; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ /* XXX perhaps they need to be contiguous to be atomic? */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { /* * This set the maximum transfer as a segment of * the buffer. */ int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; /* * space is the size left in the buffer */ if (size > space) size = space; /* * now limit it to the size of the uio transfer */ if (size > uio->uio_resid) size = uio->uio_resid; if ((error = pipelock(wpipe,1)) == 0) { /* * It is possible for a direct write to * slip in on us... handle it here... 
*/ if (wpipe->pipe_state & PIPE_DIRECTW) { pipeunlock(wpipe); goto retrywrite; } error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], size, uio); pipeunlock(wpipe); } if (error) break; wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) wpipe->pipe_buffer.in = 0; wpipe->pipe_buffer.cnt += size; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) { break; } /* * If read side wants to go away, we just issue a signal * to ourselves. */ if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if I/O was successful */ if ((wpipe->pipe_buffer.cnt == 0) && (uio->uio_resid == 0) && (error == EPIPE)) error = 0; if (error == 0) getnanotime(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); return error; } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ int pipe_ioctl(fp, cmd, data, p) struct file *fp; u_long cmd; register caddr_t data; struct proc *p; { register struct pipe *mpipe = (struct pipe *)fp->f_data; switch (cmd) { case FIONBIO: return (0); case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } return (0); case FIONREAD: if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; return (0); case TIOCSPGRP: mpipe->pipe_pgid = *(int *)data; return (0); case TIOCGPGRP: *(int *)data = mpipe->pipe_pgid; return (0); } return (ENOTTY); } int pipe_poll(fp, events, cred, p) struct file *fp; int events; struct ucred *cred; struct proc *p; { register struct pipe *rpipe = (struct pipe *)fp->f_data; struct pipe *wpipe; int revents = 0; wpipe = rpipe->pipe_peer; if (events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0) || (rpipe->pipe_state & PIPE_EOF)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || ((wpipe->pipe_state & PIPE_DIRECTW) == 0) && (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF) revents |= events & (POLLOUT | POLLWRNORM); if ((rpipe->pipe_state & PIPE_EOF) || (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) { selrecord(p, &rpipe->pipe_sel); rpipe->pipe_state |= PIPE_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(p, &wpipe->pipe_sel); wpipe->pipe_state |= PIPE_SEL; } } return (revents); } int pipe_stat(pipe, ub) register struct pipe *pipe; register struct stat *ub; { bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = pipe->pipe_buffer.size; ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = 
(ub->st_size + ub->st_blksize - 1) / ub->st_blksize; ub->st_atimespec = pipe->pipe_atime; ub->st_mtimespec = pipe->pipe_mtime; ub->st_ctimespec = pipe->pipe_ctime; /* * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev, * st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ return 0; } /* ARGSUSED */ static int pipe_close(fp, p) struct file *fp; struct proc *p; { struct pipe *cpipe = (struct pipe *)fp->f_data; pipeclose(cpipe); fp->f_data = NULL; return 0; } /* * shutdown the pipe */ static void pipeclose(cpipe) struct pipe *cpipe; { struct pipe *ppipe; if (cpipe) { pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. */ while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT|PIPE_EOF; tsleep(cpipe, PRIBIO, "pipecl", 0); } /* * Disconnect from peer */ if (ppipe = cpipe->pipe_peer) { pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); ppipe->pipe_peer = NULL; } /* * free resources */ if (cpipe->pipe_buffer.buffer) { if (cpipe->pipe_buffer.size > PIPE_SIZE) --nbigpipe; amountpipekva -= cpipe->pipe_buffer.size; kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer, cpipe->pipe_buffer.size); } #ifndef PIPE_NODIRECT if (cpipe->pipe_map.kva) { amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; kmem_free(kernel_map, cpipe->pipe_map.kva, cpipe->pipe_buffer.size + PAGE_SIZE); } #endif zfree(pipe_zone, cpipe); } } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 40699) +++ head/sys/kern/vfs_bio.c (revision 40700) @@ -1,2422 +1,2420 @@ /* * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.179 1998/10/13 08:24:40 dg Exp $ + * $Id: vfs_bio.c,v 1.180 1998/10/25 17:44:52 phk Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. 
*/ #define VMIO #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ #if 0 /* replaced bu sched_sync */ static void vfs_update __P((void)); static struct proc *updateproc; static struct kproc_desc up_kp = { "update", vfs_update, &updateproc }; SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) #endif struct buf *buf; /* buffer header pool */ struct swqueue bswlist; static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static void flushdirtybuffers(int slpflag, int slptimeo); int needsbuffer; /* * Internal update daemon, process 3 * The variable vfs_update_wakeup allows for internal syncs. */ int vfs_update_wakeup; /* * buffers base kva */ /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; static vm_offset_t bogus_offset; static int bufspace, maxbufspace, vmiospace, maxvmiobufspace, bufmallocspace, maxbufmallocspace; int numdirtybuffers; static int lodirtybuffers, hidirtybuffers; static int numfreebuffers, lofreebuffers, hifreebuffers; static int kvafreespace; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW, &maxvmiobufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD, &vmiospace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD, &kvafreespace, 0, ""); static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; struct bqueues bufqueues[BUFFER_QUEUES] = {0}; extern int vm_swap_size; #define BUF_MAXUSE 24 #define VFS_BIO_NEED_ANY 1 #define VFS_BIO_NEED_LOWLIMIT 2 #define VFS_BIO_NEED_FREE 4 /* * Initialize buffer headers and related structures. 
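 */

/*
 * A worked illustration, not part of this change, of the sizing policy that
 * bufinit() just below applies.  The nbuf value and the helper are invented
 * for illustration; the formulas are the ones used in bufinit().  With
 * nbuf = 1024 this gives dirty-buffer watermarks of 148/74 and free-buffer
 * watermarks of 122/61.
 */
static void
example_buf_limits(int nbuf)
{
	int maxbufspace = (nbuf + 8) * DFLTBSIZE;	/* sized as if every block were 8K */
	int maxvmiobufspace = 2 * maxbufspace / 3;	/* 1/3 reserved for metadata */
	int hidirtybuffers = nbuf / 8 + 20;
	int lodirtybuffers = nbuf / 16 + 10;
	int lofreebuffers = nbuf / 18 + 5;
	int hifreebuffers = 2 * lofreebuffers;

	printf("buf limits: space %d, vmio %d, dirty %d/%d, free %d/%d\n",
	    maxbufspace, maxvmiobufspace, hidirtybuffers, lodirtybuffers,
	    hifreebuffers, lofreebuffers);
}

/*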
*/ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); /* first, make a null hash table */ for (i = 0; i < BUFHSZ; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; LIST_INIT(&bp->b_dep); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is currently calculated to support all filesystem blocks * to be 8K. If you happen to use a 16K filesystem, the size of the buffer * cache is still the same as it would be for 8K filesystems. This * keeps the size of the buffer cache "in check" for big block filesystems. */ maxbufspace = (nbuf + 8) * DFLTBSIZE; /* * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed */ maxvmiobufspace = 2 * maxbufspace / 3; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = maxbufspace / 20; /* * Remove the probability of deadlock conditions by limiting the * number of dirty buffers. */ hidirtybuffers = nbuf / 8 + 20; lodirtybuffers = nbuf / 16 + 10; numdirtybuffers = 0; lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; kvafreespace = 0; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); } /* * Free the kva allocation for a buffer * Must be called only at splbio or higher, * as this is the only locking for buffer_map. */ static void bfreekva(struct buf * bp) { if (bp->b_kvasize == 0) return; vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize); bp->b_kvasize = 0; } /* * remove the buffer from the appropriate free list */ void bremfree(struct buf * bp) { int s = splbio(); if (bp->b_qindex != QUEUE_NONE) { if (bp->b_qindex == QUEUE_EMPTY) { kvafreespace -= bp->b_kvasize; } TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { #if !defined(MAX_PERF) panic("bremfree: removing a buffer when not on a queue"); #endif } if ((bp->b_flags & B_INVAL) || (bp->b_flags & (B_DELWRI|B_LOCKED)) == 0) --numfreebuffers; splx(s); } /* * Get a buffer with the specified data. Look in the cache first. */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); return (biowait(bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. 
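 */

/*
 * A minimal sketch, not part of this change, of the typical calling pattern
 * for the interfaces above and below: bread() returns a busy buffer that the
 * caller either releases with brelse() on error or hands back dirty with
 * bdwrite(), which defers the actual write.  The vnode, block number and
 * helper name are placeholders.
 */
static int
example_modify_block(struct vnode *vp, daddr_t lbn, int size,
    struct ucred *cred)
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, size, cred, &bp);	/* may be satisfied from the cache */
	if (error) {
		brelse(bp);
		return (error);
	}
	bp->b_data[0] = 0;	/* ... modify the buffer contents ... */
	bdwrite(bp);		/* mark dirty; written out later */
	return (0);
}

/*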
*/ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); VOP_STRATEGY(vp, rabp); } else { brelse(rabp); } } if (readwait) { rv = biowait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async.) */ int bwrite(struct buf * bp) { int oldflags, s; struct vnode *vp; struct mount *mp; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; #if !defined(MAX_PERF) if ((bp->b_flags & B_BUSY) == 0) panic("bwrite: buffer is not busy???"); #endif bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); bp->b_flags |= B_WRITEINPROG; s = splbio(); if ((oldflags & B_DELWRI) == B_DELWRI) { --numdirtybuffers; reassignbuf(bp, bp->b_vp); } bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != NULL) curproc->p_stats->p_ru.ru_oublock++; splx(s); VOP_STRATEGY(bp->b_vp, bp); /* * Collect statistics on synchronous and asynchronous writes. * Writes to block devices are charged to their associated * filesystem (if any). */ if ((vp = bp->b_vp) != NULL) { if (vp->v_type == VBLK) mp = vp->v_specmountpoint; else mp = vp->v_mount; if (mp != NULL) if ((oldflags & B_ASYNC) == 0) mp->mnt_stat.f_syncwrites++; else mp->mnt_stat.f_asyncwrites++; } if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); brelse(bp); return (rtval); } return (0); } __inline void vfs_bio_need_satisfy(void) { ++numfreebuffers; if (!needsbuffer) return; if (numdirtybuffers < lodirtybuffers) { needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT); } else { needsbuffer &= ~VFS_BIO_NEED_ANY; } if (numfreebuffers >= hifreebuffers) { needsbuffer &= ~VFS_BIO_NEED_FREE; } wakeup(&needsbuffer); } /* * Delayed write. (Buffer is marked dirty). */ void bdwrite(struct buf * bp) { struct vnode *vp; #if !defined(MAX_PERF) if ((bp->b_flags & B_BUSY) == 0) { panic("bdwrite: buffer is not busy"); } #endif if (bp->b_flags & B_INVAL) { brelse(bp); return; } bp->b_flags &= ~(B_READ|B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; } /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. 
*/ if (bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * XXX The soft dependency code is not prepared to * have I/O done when a bdwrite is requested. For * now we just let the write be delayed if it is * requested by the soft dependency code. */ if ((vp = bp->b_vp) && (vp->v_type == VBLK && vp->v_specmountpoint && (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))) return; if (numdirtybuffers >= hidirtybuffers) flushdirtybuffers(0, 0); return; } /* * Same as first half of bdwrite, mark buffer dirty, but do not release it. * Check how this compares with vfs_setdirty(); XXX [JRE] */ void bdirty(bp) struct buf *bp; { bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */ if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */ reassignbuf(bp, bp->b_vp); ++numdirtybuffers; } } /* * Asynchronous write. * Start output on a buffer, but do not wait for it to complete. * The buffer is released when the output completes. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) VOP_BWRITE(bp); } /* * Ordered write. * Start output on a buffer, and flag it so that the device will write * it in the order it was queued. The buffer is released when the output * completes. */ int bowrite(struct buf * bp) { bp->b_flags |= B_ORDERED|B_ASYNC; return (VOP_BWRITE(bp)); } /* * Release a buffer. */ void brelse(struct buf * bp) { int s; if (bp->b_flags & B_CLUSTER) { relpbuf(bp); return; } s = splbio(); /* anyone need this block? */ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~(B_WANTED | B_AGE); wakeup(bp); } if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || (bp->b_bufsize <= 0)) { bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); if (bp->b_flags & B_DELWRI) --numdirtybuffers; bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, so the B_INVAL flag is used to *invalidate* the buffer, * but the VM object is kept around. The B_NOCACHE flag is used to * invalidate the pages in the VM object. * * If the buffer is a partially filled NFS buffer, keep it * since invalidating it now will lose informatio. The valid * flags in the vm_pages have only DEV_BSIZE resolution but * the b_validoff, b_validend fields have byte resolution. * This can avoid unnecessary re-reads of the buffer. 
* XXX this seems to cause performance problems. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK && (bp->b_flags & B_DELWRI) != 0) #ifdef notdef && (bp->b_vp->v_tag != VT_NFS || bp->b_vp->v_type == VBLK || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || bp->b_validend == 0 || (bp->b_validoff == 0 && bp->b_validend == bp->b_bufsize)) #endif ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; struct vnode *vp; vp = bp->b_vp; resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if (m == bogus_page) { obj = (vm_object_t) vp->v_object; poff = OFF_TO_IDX(bp->b_offset); for (j = i; j < bp->b_npages; j++) { m = bp->b_pages[j]; if (m == bogus_page) { m = vm_page_lookup(obj, poff + j); #if !defined(MAX_PERF) if (!m) { panic("brelse: page missing\n"); } #endif bp->b_pages[j] = m; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } if (bp->b_flags & (B_NOCACHE|B_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; vm_page_set_invalid(m, poffset, presid); } resid -= PAGE_SIZE; } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); #endif /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; bp->b_qindex = QUEUE_EMPTY; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; kvafreespace += bp->b_kvasize; /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_flags |= B_INVAL; bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else if (bp->b_flags & B_AGE) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); /* buffers with valid and quite potentially reuseable contents */ } else { bp->b_qindex = QUEUE_LRU; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } if ((bp->b_flags & B_INVAL) || (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { if (bp->b_flags & B_DELWRI) { --numdirtybuffers; bp->b_flags &= ~B_DELWRI; } vfs_bio_need_satisfy(); } /* unlock */ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } /* * Release a buffer. */ void bqrelse(struct buf * bp) { int s; s = splbio(); /* anyone need this block? 
*/ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~(B_WANTED | B_AGE); wakeup(bp); } #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); #endif if (bp->b_flags & B_LOCKED) { bp->b_flags &= ~B_ERROR; bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else { bp->b_qindex = QUEUE_LRU; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { vfs_bio_need_satisfy(); } /* unlock */ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } static void vfs_vmio_release(bp) struct buf *bp; { int i; vm_page_t m; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; - vm_page_unwire(m); + if ((bp->b_flags & B_ASYNC) == 0) + vm_page_unwire(m, (bp->b_flags & B_ASYNC) == 0 ? 0 : 1); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->flags & PG_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { + vm_page_flag_clear(m, PG_ZERO); /* * If this is an async free -- we cannot place * pages onto the cache queue. If it is an * async free, then we don't modify any queues. * This is probably in error (for perf reasons), * and we will eventually need to build * a more complete infrastructure to support I/O * rundown. */ if ((bp->b_flags & B_ASYNC) == 0) { /* * In the case of sync buffer frees, we can do pretty much * anything to any of the memory queues. Specifically, * the cache queue is okay to be modified. */ if (m->valid) { if(m->dirty == 0) vm_page_test_dirty(m); /* * this keeps pressure off of the process memory */ if (m->dirty == 0 && m->hold_count == 0) vm_page_cache(m); - else - vm_page_deactivate(m); - vm_page_flag_clear(m, PG_ZERO); } else if (m->hold_count == 0) { vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } } else { /* * If async, then at least we clear the * act_count. */ m->act_count = 0; - vm_page_flag_clear(m, PG_ZERO); } } } bufspace -= bp->b_bufsize; vmiospace -= bp->b_bufsize; pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); bp->b_npages = 0; bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block is currently memory resident. */ struct buf * gbincore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp != NULL) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { break; } bp = bp->b_hash.le_next; } return (bp); } /* * this routine implements clustered async writes for * clearing out B_DELWRI buffers... This is much better * than the old way of writing only one buffer at a time. 
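 */

/*
 * A minimal sketch, not part of this change, of the adjacency test that
 * vfs_bio_awrite() below applies while sizing a cluster: a logically
 * consecutive delayed-write buffer only joins the cluster when it already
 * has a translated disk address (b_blkno != b_lblkno) and that address is
 * physically consecutive as well.  The helper name is invented; bsize is
 * the filesystem block size and i the logical distance from the first
 * buffer.
 */
static int
example_cluster_adjacent(struct buf *bp, struct buf *bpa, int i, int bsize)
{
	return (bpa->b_blkno != bpa->b_lblkno &&
	    bpa->b_blkno == bp->b_blkno + ((i * bsize) >> DEV_BSHIFT));
}

/*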
*/ int vfs_bio_awrite(struct buf * bp) { int i; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; int nwritten; int size; int maxcl; s = splbio(); /* * right now we support clustered writing only to regular files */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { if ((bpa = gbincore(vp, lblkno + i)) && ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } ncl = i; /* * this is a possible cluster write */ if (ncl != 1) { nwritten = cluster_wbuild(vp, size, lblkno, ncl); splx(s); return nwritten; } } bremfree(bp); bp->b_flags |= B_BUSY | B_ASYNC; splx(s); /* * default (old) behavior, writing out only one block */ nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp); return nwritten; } /* * Find a buffer header which is available for use. */ static struct buf * getnewbuf(struct vnode *vp, daddr_t blkno, int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp, *bp1; int nbyteswritten = 0; vm_offset_t addr; static int writerecursion = 0; start: if (bufspace >= maxbufspace) goto trytofreespace; /* can we constitute a new buffer? */ if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) { #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_EMPTY) panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", bp->b_qindex); #endif bp->b_flags |= B_BUSY; bremfree(bp); goto fillbuf; } trytofreespace: /* * We keep the file I/O from hogging metadata I/O * This is desirable because file data is cached in the * VM/Buffer cache even if a buffer is freed. */ if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) { #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_AGE) panic("getnewbuf: inconsistent AGE queue, qindex=%d", bp->b_qindex); #endif } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) { #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue, qindex=%d", bp->b_qindex); #endif } if (!bp) { /* wait for a free buffer of any kind */ needsbuffer |= VFS_BIO_NEED_ANY; do tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); while (needsbuffer & VFS_BIO_NEED_ANY); return (0); } #if defined(DIAGNOSTIC) if (bp->b_flags & B_BUSY) { panic("getnewbuf: busy buffer on free list\n"); } #endif /* * We are fairly aggressive about freeing VMIO buffers, but since * the buffering is intact without buffer headers, there is not * much loss. We gain by maintaining non-VMIOed metadata in buffers. */ if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { if ((bp->b_flags & B_VMIO) == 0 || (vmiospace < maxvmiobufspace)) { --bp->b_usecount; TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); goto start; } TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } } /* if we are a delayed write, convert to an async write */ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { /* * If our delayed write is likely to be used soon, then * recycle back onto the LRU queue. 
*/ if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) && (bp->b_lblkno >= blkno) && (maxsize > 0)) { if (bp->b_usecount > 0) { if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) { TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); bp->b_usecount--; goto start; } TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } } } /* * Certain layered filesystems can recursively re-enter the vfs_bio * code, due to delayed writes. This helps keep the system from * deadlocking. */ if (writerecursion > 0) { if (writerecursion > 5) { bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); while (bp) { if ((bp->b_flags & B_DELWRI) == 0) break; bp = TAILQ_NEXT(bp, b_freelist); } if (bp == NULL) { bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); while (bp) { if ((bp->b_flags & B_DELWRI) == 0) break; bp = TAILQ_NEXT(bp, b_freelist); } } if (bp == NULL) panic("getnewbuf: cannot get buffer, infinite recursion failure"); } else { bremfree(bp); bp->b_flags |= B_BUSY | B_AGE | B_ASYNC; nbyteswritten += bp->b_bufsize; ++writerecursion; VOP_BWRITE(bp); --writerecursion; if (!slpflag && !slptimeo) { return (0); } goto start; } } else { ++writerecursion; nbyteswritten += vfs_bio_awrite(bp); --writerecursion; if (!slpflag && !slptimeo) { return (0); } goto start; } } if (bp->b_flags & B_WANTED) { bp->b_flags &= ~B_WANTED; wakeup(bp); } bremfree(bp); bp->b_flags |= B_BUSY; if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); fillbuf: /* we are not free, nor do we contain interesting data */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); if (bp->b_bufsize) { allocbuf(bp, 0); } bp->b_flags = B_BUSY; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_validoff = bp->b_validend = 0; bp->b_usecount = 5; /* Here, not kern_physio.c, is where this should be done*/ LIST_INIT(&bp->b_dep); maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; /* * we assume that buffer_map is not at address 0 */ addr = 0; if (maxsize != bp->b_kvasize) { bfreekva(bp); findkvaspace: /* * See if we have buffer kva space */ if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { if (kvafreespace > 0) { int totfree = 0, freed; do { freed = 0; for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) { if (bp1->b_kvasize != 0) { totfree += bp1->b_kvasize; freed = bp1->b_kvasize; bremfree(bp1); bfreekva(bp1); brelse(bp1); break; } } } while (freed); /* * if we found free space, then retry with the same buffer. */ if (totfree) goto findkvaspace; } bp->b_flags |= B_INVAL; brelse(bp); goto trytofreespace; } } /* * See if we are below are allocated minimum */ if (bufspace >= (maxbufspace + nbyteswritten)) { bp->b_flags |= B_INVAL; brelse(bp); goto trytofreespace; } /* * create a map entry for the buffer -- in essence * reserving the kva space. 
*/ if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; } bp->b_data = bp->b_kvabase; return (bp); } static void waitfreebuffers(int slpflag, int slptimeo) { while (numfreebuffers < hifreebuffers) { flushdirtybuffers(slpflag, slptimeo); if (numfreebuffers < hifreebuffers) break; needsbuffer |= VFS_BIO_NEED_FREE; if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) break; } } static void flushdirtybuffers(int slpflag, int slptimeo) { int s; static pid_t flushing = 0; s = splbio(); if (flushing) { if (flushing == curproc->p_pid) { splx(s); return; } while (flushing) { if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) { splx(s); return; } } } flushing = curproc->p_pid; while (numdirtybuffers > lodirtybuffers) { struct buf *bp; needsbuffer |= VFS_BIO_NEED_LOWLIMIT; bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); if (bp == NULL) bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); while (bp && ((bp->b_flags & B_DELWRI) == 0)) { bp = TAILQ_NEXT(bp, b_freelist); } if (bp) { vfs_bio_awrite(bp); continue; } break; } flushing = 0; wakeup(&flushing); splx(s); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; int s = splbio(); bp = gbincore(vp, blkno); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc; vm_page_t m; vm_ooffset_t off; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0) return 0; obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > vp->v_mount->mnt_stat.f_iosize) tinc = vp->v_mount->mnt_stat.f_iosize; off = blkno * vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) return 0; if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) return 0; } return 1; } /* * now we set the dirty range for the buffer -- * for NFS -- if the file is mapped and pages have * been written to, let it know. We want the * entire range of the buffer to be marked dirty if * any of the pages have been written to for consistancy * with the b_validoff, b_validend set in the nfs write * code, and used by the nfs read code. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; vm_offset_t boffset, offset; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) && ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { /* * test the pages to see if they have been modified directly * by users through the VM system. 
*/ for (i = 0; i < bp->b_npages; i++) { vm_page_flag_clear(bp->b_pages[i], PG_ZERO); vm_page_test_dirty(bp->b_pages[i]); } /* * scan forwards for the first page modified */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) { break; } } boffset = (i << PAGE_SHIFT); if (boffset < bp->b_dirtyoff) { bp->b_dirtyoff = boffset; } /* * scan backwards for the last page modified */ for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } boffset = (i + 1); offset = boffset + bp->b_pages[0]->pindex; if (offset >= object->size) boffset = object->size - bp->b_pages[0]->pindex; if (bp->b_dirtyend < (boffset << PAGE_SHIFT)) bp->b_dirtyend = (boffset << PAGE_SHIFT); } } /* * Get a block given a specified block and offset into a file/device. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int i, s; struct bufhashhdr *bh; int maxsize; int checksize; if (vp->v_mount) { maxsize = vp->v_mount->mnt_stat.f_iosize; /* * This happens on mount points. */ if (maxsize < size) maxsize = size; } else { maxsize = size; } #if !defined(MAX_PERF) if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); #endif s = splbio(); loop: if (numfreebuffers < lofreebuffers) { waitfreebuffers(slpflag, slptimeo); } if ((bp = gbincore(vp, blkno))) { if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; if (!tsleep(bp, (PRIBIO + 4) | slpflag, "getblk", slptimeo)) { goto loop; } splx(s); return (struct buf *) NULL; } bp->b_flags |= B_BUSY | B_CACHE; bremfree(bp); /* * check for size inconsistancies (note that they shouldn't * happen but do when filesystems don't handle the size changes * correctly.) We are conservative on metadata and don't just * extend the buffer but write (if needed) and re-constitute it. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) { allocbuf(bp, size); } else { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp); } } goto loop; } } #ifdef DIAGNOSTIC if (bp->b_offset == NOOFFSET) panic("getblk: no buffer offset"); #endif /* * Check that the constituted buffer really deserves for the * B_CACHE bit to be set. B_VMIO type buffers might not * contain fully valid pages. Normal (old-style) buffers * should be fully valid. */ if (bp->b_flags & B_VMIO) { checksize = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { int resid; int poffset; poffset = bp->b_offset & PAGE_MASK; resid = (checksize > (PAGE_SIZE - poffset)) ? (PAGE_SIZE - poffset) : checksize; if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) { bp->b_flags &= ~(B_CACHE | B_DONE); break; } checksize -= resid; } } if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); return (bp); } else { vm_object_t obj; if ((bp = getnewbuf(vp, blkno, slpflag, slptimeo, size, maxsize)) == 0) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * Normally the vnode is locked so this isn't a problem. * VBLK type I/O requests, however, don't lock the vnode. */ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. 
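 *
 * The lookup side of this, gbincore() (used by incore() earlier in the
 * file), is the mirror image: hash (vp, blkno) with BUFHASH() and walk
 * the chain.  Roughly:
 *
 *	for (bp = LIST_FIRST(BUFHASH(vp, blkno)); bp != NULL;
 *	    bp = LIST_NEXT(bp, b_hash))
 *		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
 *		    (bp->b_flags & B_INVAL) == 0)
 *			return (bp);
 *	return (NULL);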
*/ bp->b_blkno = bp->b_lblkno = blkno; if (vp->v_type != VBLK) bp->b_offset = (off_t) blkno * maxsize; else bp->b_offset = (off_t) blkno * DEV_BSIZE; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) { bp->b_flags |= (B_VMIO | B_CACHE); #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG && vp->v_type != VBLK) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } allocbuf(bp, size); splx(s); return (bp); } } /* * Get an empty, disassociated buffer of given size. */ struct buf * geteblk(int size) { struct buf *bp; int s; s = splbio(); while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0); splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * * Modify the length of a buffer's underlying buffer storage without * destroying information (unless, of course the buffer is shrinking). */ int allocbuf(struct buf * bp, int size) { int s; int newbsize, mbsize; int i; #if !defined(MAX_PERF) if (!(bp->b_flags & B_BUSY)) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); #endif if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) newbsize = mbsize; else #endif newbsize = round_page(size); if (newbsize < bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; bp->b_data = bp->b_kvabase; bp->b_bufsize = 0; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } #endif vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer grows. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; bufspace += mbsize; bufmallocspace += mbsize; return 1; } #endif origbuf = NULL; origbufsize = 0; #if !defined(NO_B_MALLOC) /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. 
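 *
 * Restating the allocation policy above: anonymous buffer memory comes
 * from malloc(9) only while all of
 *
 *	bufmallocspace < maxbufmallocspace	(global budget left)
 *	bp->b_bufsize == 0			(first allocation)
 *	mbsize <= PAGE_SIZE / 2			(small request)
 *
 * hold; everything else is page-backed through vm_hold_load_pages(),
 * and a grown B_MALLOC buffer has its old contents carried over with
 * the bcopy() below.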
*/ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; bp->b_bufsize = 0; bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } #endif vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); #if !defined(NO_B_MALLOC) if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } #endif } } else { vm_page_t m; int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (round_page(newbsize) >> PAGE_SHIFT); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); #endif if (newbsize < bp->b_bufsize) { if (desiredpages < bp->b_npages) { for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of vnode_pager_setsize */ m = bp->b_pages[i]; #if defined(DIAGNOSTIC) if (m == bogus_page) panic("allocbuf: bogus page found"); #endif vm_page_sleep(m, "biodep", &m->busy); bp->b_pages[i] = NULL; - vm_page_unwire(m); + vm_page_unwire(m, 0); } pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (newbsize > bp->b_bufsize) { vm_object_t obj; vm_offset_t tinc, toff; vm_ooffset_t off; vm_pindex_t objoff; int pageindex, curbpnpages; struct vnode *vp; int bsize; int orig_validoff = bp->b_validoff; int orig_validend = bp->b_validend; vp = bp->b_vp; if (vp->v_type == VBLK) bsize = DEV_BSIZE; else bsize = vp->v_mount->mnt_stat.f_iosize; if (bp->b_npages < desiredpages) { obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > bsize) tinc = bsize; off = bp->b_offset; #ifdef DIAGNOSTIC if (bp->b_offset == NOOFFSET) panic("allocbuf: no buffer offset"); #endif curbpnpages = bp->b_npages; doretry: bp->b_validoff = orig_validoff; bp->b_validend = orig_validend; bp->b_flags |= B_CACHE; for (toff = 0; toff < newbsize; toff += tinc) { int bytesinpage; pageindex = toff >> PAGE_SHIFT; objoff = OFF_TO_IDX(off + toff); if (pageindex < curbpnpages) { m = bp->b_pages[pageindex]; #ifdef VFS_BIO_DIAG if (m->pindex != objoff) panic("allocbuf: page changed offset??!!!?"); #endif bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; if (bp->b_flags & B_CACHE) vfs_buf_set_valid(bp, off, toff, bytesinpage, m); continue; } m = vm_page_lookup(obj, objoff); if (!m) { m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); if (!m) { VM_WAIT; vm_pageout_deficit += (desiredpages - bp->b_npages); goto doretry; } vm_page_wire(m); vm_page_flag_clear(m, PG_BUSY); bp->b_flags &= ~B_CACHE; } else if (m->flags & PG_BUSY) { s = splvm(); if (m->flags & PG_BUSY) { vm_page_flag_set(m, PG_WANTED); tsleep(m, PVM, "pgtblk", 0); } splx(s); goto doretry; } else { if ((curproc != pageproc) && ((m->queue - m->pc) == PQ_CACHE) && ((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; if (bp->b_flags & B_CACHE) vfs_buf_set_valid(bp, off, toff, bytesinpage, m); vm_page_flag_clear(m, PG_ZERO); vm_page_wire(m); } bp->b_pages[pageindex] = m; curbpnpages = pageindex + 1; } if (vp->v_tag == VT_NFS && vp->v_type != VBLK) { if (bp->b_dirtyend > 0) { bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); bp->b_validend = max(bp->b_validend, bp->b_dirtyend); } if (bp->b_validend == 0) bp->b_flags &= 
~B_CACHE; } bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); bp->b_npages = curbpnpages; pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages); ((vm_offset_t) bp->b_data) |= off & PAGE_MASK; } } } if (bp->b_flags & B_VMIO) vmiospace += (newbsize - bp->b_bufsize); bufspace += (newbsize - bp->b_bufsize); bp->b_bufsize = newbsize; bp->b_bcount = size; return 1; } /* * Wait for buffer I/O completion, returning error status. */ int biowait(register struct buf * bp) { int s; s = splbio(); while ((bp->b_flags & B_DONE) == 0) #if defined(NO_SCHEDULE_MODS) tsleep(bp, PRIBIO, "biowait", 0); #else if (bp->b_flags & B_READ) tsleep(bp, PRIBIO, "biord", 0); else tsleep(bp, PRIBIO, "biowr", 0); #endif splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_flags & B_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Finish I/O on a buffer, calling an optional function. * This is usually called from interrupt level, so process blocking * is not *a good idea*. */ void biodone(register struct buf * bp) { int s; s = splbio(); #if !defined(MAX_PERF) if (!(bp->b_flags & B_BUSY)) panic("biodone: buffer not busy"); #endif if (bp->b_flags & B_DONE) { splx(s); #if !defined(MAX_PERF) printf("biodone: buffer already done\n"); #endif return; } bp->b_flags |= B_DONE; if (bp->b_flags & B_FREEBUF) { brelse(bp); splx(s); return; } if ((bp->b_flags & B_READ) == 0) { vwakeup(bp); } /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) (*bioops.io_complete)(bp); if (bp->b_flags & B_VMIO) { int i, resid; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; obj = vp->v_object; #if defined(VFS_BIO_DEBUG) if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } if (vp->v_object == NULL) { panic("biodone: missing VM object"); } if ((vp->v_flag & VOBJBUF) == 0) { panic("biodone: vnode is not setup for merged cache"); } #endif foff = bp->b_offset; #ifdef DIAGNOSTIC if (bp->b_offset == NOOFFSET) panic("biodone: no buffer offset"); #endif #if !defined(MAX_PERF) if (!obj) { panic("biodone: no object"); } #endif #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif iosize = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif vm_object_pip_subtract(obj, 1); continue; } bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex); } #endif resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iosize) resid = iosize; /* * In the write case, the valid and clean bits are * already changed correctly, so we only need to do this * here in the read case. */ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } vm_page_flag_clear(m, PG_ZERO); /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! 
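 *
 * For a VMIO buffer the expected pairing (leaving the B_CLUSTER case
 * aside) is:
 *
 *	vfs_busy_pages():	vm_object_pip_add(obj, 1);
 *				vm_page_io_start(m);
 *	    ... device strategy / actual I/O ...
 *	biodone():		vm_page_io_finish(m);
 *				vm_object_pip_subtract(obj, 1);
 *
 * If the I/O was started without going through vfs_busy_pages(), or
 * something dropped the busy count early, m->busy is already zero here
 * and the check below panics.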
*/ if (m->busy == 0) { #if !defined(MAX_PERF) printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); #endif if (vp->v_type != VBLK) #if !defined(MAX_PERF) printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, (int) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", (int) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", m->valid, m->dirty, m->wire_count); #endif panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff += resid; iosize -= resid; } if (obj && (obj->paging_in_progress == 0) && (obj->flags & OBJ_PIPWNT)) { vm_object_clear_flag(obj, OBJ_PIPWNT); wakeup(obj); } } /* * For asynchronous completions, release the buffer now. The brelse * checks for B_WANTED and will do the wakeup there if necessary - so * no need to do a wakeup here in the async case. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) brelse(bp); else bqrelse(bp); } else { bp->b_flags &= ~B_WANTED; wakeup(bp); } splx(s); } #if 0 /* not with kirks code */ static int vfs_update_interval = 30; static void vfs_update() { while (1) { tsleep(&vfs_update_wakeup, PUSER, "update", hz * vfs_update_interval); vfs_update_wakeup = 0; sync(curproc, NULL); } } static int sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error) wakeup(&vfs_update_wakeup); return error; } SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); #endif /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); #if !defined(MAX_PERF) if (!m) { panic("vfs_unbusy_pages: page missing\n"); } #endif bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_flag_clear(m, PG_ZERO); vm_page_io_finish(m); } if (obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { vm_object_clear_flag(obj, OBJ_PIPWNT); wakeup(obj); } } } /* * Set NFS' b_validoff and b_validend fields from the valid bits * of a page. If the consumer is not NFS, and the page is not * valid for the entire range, clear the B_CACHE flag to force * the consumer to re-read the page. */ static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) { vm_offset_t svalid, evalid; int validbits = m->valid; /* * This only bothers with the first valid range in the * page. */ svalid = off; while (validbits && !(validbits & 1)) { svalid += DEV_BSIZE; validbits >>= 1; } evalid = svalid; while (validbits & 1) { evalid += DEV_BSIZE; validbits >>= 1; } /* * Make sure this range is contiguous with the range * built up from previous pages. If not, then we will * just use the range from the previous pages. 
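 *
 * m->valid carries one bit per DEV_BSIZE chunk of the page, so the two
 * loops above are a simple run scan.  For example, with validbits ==
 * 0x3c (binary 00111100) and off == 0:
 *
 *	svalid = off + 2 * DEV_BSIZE	(skip the two clear low bits)
 *	evalid = off + 6 * DEV_BSIZE	(end of the four-bit valid run)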
*/ if (svalid == bp->b_validend) { bp->b_validoff = min(bp->b_validoff, svalid); bp->b_validend = max(bp->b_validend, evalid); } } else if (!vm_page_is_valid(m, (vm_offset_t) ((foff + off) & PAGE_MASK), size)) { bp->b_flags &= ~B_CACHE; } } /* * Set the valid bits in a page, taking care of the b_validoff, * b_validend fields which NFS uses to optimise small reads. Off is * the offset within the file and pageno is the page index within the buf. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { struct vnode *vp = bp->b_vp; vm_ooffset_t soff, eoff; soff = off; eoff = off + min(PAGE_SIZE, bp->b_bufsize); if (vp->v_tag == VT_NFS && vp->v_type != VBLK) { vm_ooffset_t sv, ev; vm_page_set_invalid(m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff)); off = off - pageno * PAGE_SIZE; sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1)); ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1)); soff = qmax(sv, soff); eoff = qmin(ev, eoff); } if (eoff > soff) vm_page_set_validclean(m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff)); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. */ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; vm_ooffset_t foff; foff = bp->b_offset; #ifdef DIAGNOSTIC if (bp->b_offset == NOOFFSET) panic("vfs_busy_pages: no buffer offset"); #endif vfs_setdirty(bp); retry: for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (vm_page_sleep(m, "vbpage", NULL)) goto retry; } for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) { vm_page_t m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } vm_page_protect(m, VM_PROT_NONE); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (bp->b_bcount >= PAGE_SIZE) { if (m->valid && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } } } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. 
*/ void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; foff = bp->b_offset; #ifdef DIAGNOSTIC if (bp->b_offset == NOOFFSET) panic("vfs_clean_pages: no buffer offset"); #endif for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) { vm_page_t m = bp->b_pages[i]; vfs_page_set_valid(bp, foff, i, m); } } } void vfs_bio_clrbuf(struct buf *bp) { int i; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) { int mask; mask = 0; for(i=0;ib_bufsize;i+=DEV_BSIZE) mask |= (1 << (i/DEV_BSIZE)); if(((bp->b_pages[0]->flags & PG_ZERO) == 0) && (bp->b_pages[0]->valid != mask)) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid = mask; bp->b_resid = 0; return; } for(i=0;ib_npages;i++) { if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL) continue; if( bp->b_pages[i]->valid == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE); } } else { int j; for(j=0;jb_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE); } } bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(bp->b_pages[i], PG_ZERO); } bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_unload pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; goto tryagain; } vm_page_wire(p); p->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(p, PG_ZERO); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); bp->b_pages[index] = p; vm_page_wakeup(p); } bp->b_npages = index; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { #if !defined(MAX_PERF) if (p->busy) { printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", bp->b_blkno, bp->b_lblkno); } #endif bp->b_pages[index] = NULL; pmap_kremove(pg); vm_page_busy(p); - vm_page_unwire(p); + vm_page_unwire(p, 0); vm_page_free(p); } } bp->b_npages = newnpages; } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc, (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " "b_resid = %ld\nb_dev = 0x%x, b_data = %p, " "b_blkno = %d, b_pblkno = %d\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < 
bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ Index: head/sys/miscfs/procfs/procfs_mem.c =================================================================== --- head/sys/miscfs/procfs/procfs_mem.c (revision 40699) +++ head/sys/miscfs/procfs/procfs_mem.c (revision 40700) @@ -1,342 +1,342 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 Sean Eric Fagan * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry and Sean Eric Fagan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_mem.c 8.5 (Berkeley) 6/15/94 * - * $Id: procfs_mem.c,v 1.33 1998/06/07 17:11:57 dfr Exp $ + * $Id: procfs_mem.c,v 1.34 1998/07/15 02:32:19 bde Exp $ */ /* * This is a lightly hacked and merged version * of sef's pread/pwrite functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int procfs_rwmem __P((struct proc *curp, struct proc *p, struct uio *uio)); static int procfs_rwmem(curp, p, uio) struct proc *curp; struct proc *p; struct uio *uio; { int error; int writing; struct vmspace *vm; vm_map_t map; vm_object_t object = NULL; vm_offset_t pageno = 0; /* page number */ vm_prot_t reqprot; vm_offset_t kva; /* * if the vmspace is in the midst of being deallocated or the * process is exiting, don't try to grab anything. The page table * usage in that process can be messed up. */ vm = p->p_vmspace; if ((p->p_flag & P_WEXIT) || (vm->vm_refcnt < 1)) return EFAULT; ++vm->vm_refcnt; /* * The map we want... */ map = &vm->vm_map; writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) : VM_PROT_READ; kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE); /* * Only map in one page at a time. 
We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_map_t tmap; vm_offset_t uva; int page_offset; /* offset into page */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired; vm_pindex_t pindex; u_int len; vm_page_t m; object = NULL; uva = (vm_offset_t) uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); if (uva >= VM_MAXUSER_ADDRESS) { vm_offset_t tkva; if (writing || uva >= VM_MAXUSER_ADDRESS + UPAGES * PAGE_SIZE || (ptrace_read_u_check(p, uva - (vm_offset_t) VM_MAXUSER_ADDRESS, (size_t) len) && !procfs_kmemaccess(curp))) { error = 0; break; } /* we are reading the "U area", force it into core */ PHOLD(p); /* sanity check */ if (!(p->p_flag & P_INMEM)) { /* aiee! */ PRELE(p); error = EFAULT; break; } /* populate the ptrace/procfs area */ p->p_addr->u_kproc.kp_proc = *p; fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); /* locate the in-core address */ tkva = (uintptr_t)p->p_addr + uva - VM_MAXUSER_ADDRESS; /* transfer it */ error = uiomove((caddr_t)tkva, len, uio); /* let the pages go */ PRELE(p); continue; } /* * Fault the page on behalf of the process */ error = vm_fault(map, pageno, reqprot, FALSE); if (error) { error = EFAULT; break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry, &object, &pindex, &out_prot, &wired); if (error) { error = EFAULT; /* * Make sure that there is no residue in 'object' from * an error return on vm_map_lookup. */ object = NULL; break; } m = vm_page_lookup(object, pindex); /* Allow fallback to backing objects if we are reading */ while (m == NULL && !writing && object->backing_object) { pindex += OFF_TO_IDX(object->backing_object_offset); object = object->backing_object; m = vm_page_lookup(object, pindex); } if (m == NULL) { error = EFAULT; /* * Make sure that there is no residue in 'object' from * an error return on vm_map_lookup. */ object = NULL; vm_map_lookup_done(tmap, out_entry); break; } /* * Wire the page into memory */ vm_page_wire(m); /* * We're done with tmap now. * But reference the object first, so that we won't loose * it. */ vm_object_reference(object); vm_map_lookup_done(tmap, out_entry); pmap_kenter(kva, VM_PAGE_TO_PHYS(m)); /* * Now do the i/o move. */ error = uiomove((caddr_t)(kva + page_offset), len, uio); pmap_kremove(kva); /* * release the page and the object */ - vm_page_unwire(m); + vm_page_unwire(m, 1); vm_object_deallocate(object); object = NULL; } while (error == 0 && uio->uio_resid > 0); if (object) vm_object_deallocate(object); kmem_free(kernel_map, kva, PAGE_SIZE); vmspace_free(vm); return (error); } /* * Copy data in and out of the target process. * We do this by mapping the process's page into * the kernel and then doing a uiomove direct * from the kernel address space. */ int procfs_domem(curp, p, pfs, uio) struct proc *curp; struct proc *p; struct pfsnode *pfs; struct uio *uio; { if (uio->uio_resid == 0) return (0); /* * XXX * We need to check for KMEM_GROUP because ps is sgid kmem; * not allowing it here causes ps to not work properly. Arguably, * this is a bug with what ps does. We only need to do this * for Pmem nodes, and only if it's reading. 
This is still not * good, as it may still be possible to grab illicit data if * a process somehow gets to be KMEM_GROUP. Note that this also * means that KMEM_GROUP can't change without editing procfs.h! * All in all, quite yucky. */ if (!CHECKIO(curp, p) && !(uio->uio_rw == UIO_READ && procfs_kmemaccess(curp))) return EPERM; return (procfs_rwmem(curp, p, uio)); } /* * Given process (p), find the vnode from which * its text segment is being executed. * * It would be nice to grab this information from * the VM system, however, there is no sure-fire * way of doing that. Instead, fork(), exec() and * wait() all maintain the p_textvp field in the * process proc structure which contains a held * reference to the exec'ed vnode. */ struct vnode * procfs_findtextvp(p) struct proc *p; { return (p->p_textvp); } int procfs_kmemaccess(curp) struct proc *curp; { int i; struct ucred *cred; cred = curp->p_cred->pc_ucred; if (suser(cred, &curp->p_acflag)) return 1; for (i = 0; i < cred->cr_ngroups; i++) if (cred->cr_groups[i] == KMEM_GROUP) return 1; return 0; } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 40699) +++ head/sys/vm/vm_fault.c (revision 40700) @@ -1,1126 +1,1126 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_fault.c,v 1.88 1998/09/04 08:06:57 dfr Exp $ + * $Id: vm_fault.c,v 1.89 1998/10/25 17:44:58 phk Exp $ */ /* * Page fault handling module. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vm_fault_additional_pages __P((vm_page_t, int, int, vm_page_t *, int *)); #define VM_FAULT_READ_AHEAD 8 #define VM_FAULT_READ_BEHIND 7 #define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) struct faultstate { vm_page_t m; vm_object_t object; vm_pindex_t pindex; vm_page_t first_m; vm_object_t first_object; vm_pindex_t first_pindex; vm_map_t map; vm_map_entry_t entry; int lookup_still_valid; struct vnode *vp; }; static void release_page(struct faultstate *fs) { vm_page_wakeup(fs->m); vm_page_deactivate(fs->m); fs->m = NULL; } static void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = FALSE; } } static void _unlock_things(struct faultstate *fs, int dealloc) { vm_object_pip_wakeup(fs->object); if (fs->object != fs->first_object) { vm_page_free(fs->first_m); vm_object_pip_wakeup(fs->first_object); fs->first_m = NULL; } if (dealloc) { vm_object_deallocate(fs->first_object); } unlock_map(fs); if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } #define unlock_things(fs) _unlock_things(fs, 0) #define unlock_and_deallocate(fs) _unlock_things(fs, 1) /* * vm_fault: * * Handle a page fault occuring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags) { vm_prot_t prot; int result; boolean_t wired; int map_generation; vm_page_t old_m; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ]; int hardfault; int faultcount; struct faultstate fs; cnt.v_vm_faults++; /* needs lock XXX */ hardfault = 0; RetryFault:; fs.map = map; /* * Find the backing store object and offset into it to begin the * search. 
*/ if ((result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired)) != KERN_SUCCESS) { if ((result != KERN_PROTECTION_FAILURE) || ((fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)) { return result; } /* * If we are user-wiring a r/w segment, and it is COW, then * we need to do the COW operation. Note that we don't COW * currently RO sections now, because it is NOT desirable * to COW .text. We simply keep .text from ever being COW'ed * and take the heat that one cannot debug wired .text sections. */ result = vm_map_lookup(&fs.map, vaddr, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { return result; } /* * If we don't COW now, on a user wire, the user will never * be able to write to the mapping. If we don't make this * restriction, the bookkeeping would be nearly impossible. */ if ((fs.entry->protection & VM_PROT_WRITE) == 0) fs.entry->max_protection &= ~VM_PROT_WRITE; } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. */ vm_object_reference(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.vp = vnode_pager_lock(fs.first_object); if ((fault_type & VM_PROT_WRITE) && (fs.first_object->type == OBJT_VNODE)) { vm_freeze_copyopts(fs.first_object, fs.first_pindex, fs.first_pindex + 1); } fs.lookup_still_valid = TRUE; if (wired) fault_type = prot; fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; /* * See whether this page is resident */ while (TRUE) { if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { int queue; /* * If the page is being brought in, wait for it and * then retry. */ if ((fs.m->flags & PG_BUSY) || (fs.m->busy && (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { int s; unlock_things(&fs); s = splvm(); if ((fs.m->flags & PG_BUSY) || (fs.m->busy && (fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { vm_page_flag_set(fs.m, PG_WANTED | PG_REFERENCED); cnt.v_intrans++; tsleep(fs.m, PSWP, "vmpfw", 0); } splx(s); vm_object_deallocate(fs.first_object); goto RetryFault; } queue = fs.m->queue; vm_page_unqueue_nowakeup(fs.m); /* * Mark page busy for other processes, and the pagedaemon. */ if (((queue - fs.m->pc) == PQ_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { vm_page_activate(fs.m); unlock_and_deallocate(&fs); VM_WAIT; goto RetryFault; } vm_page_busy(fs.m); if (((fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && fs.m->object != kernel_object && fs.m->object != kmem_object) { goto readrest; } break; } if (((fs.object->type != OBJT_DEFAULT) && (((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) || (fs.object == fs.first_object)) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. */ fs.m = vm_page_alloc(fs.object, fs.pindex, (fs.vp || fs.object->backing_object)? 
VM_ALLOC_NORMAL: VM_ALLOC_ZERO); if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAIT; goto RetryFault; } } readrest: if (fs.object->type != OBJT_DEFAULT && (((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) { int rv; int reqpage; int ahead, behind; if (fs.first_object->behavior == OBJ_RANDOM) { ahead = 0; behind = 0; } else { behind = (vaddr - fs.entry->start) >> PAGE_SHIFT; if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = ((fs.entry->end - vaddr) >> PAGE_SHIFT) - 1; if (ahead > VM_FAULT_READ_AHEAD) ahead = VM_FAULT_READ_AHEAD; } if ((fs.first_object->type != OBJT_DEVICE) && (fs.first_object->behavior == OBJ_SEQUENTIAL)) { vm_pindex_t firstpindex, tmppindex; if (fs.first_pindex < 2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1)) firstpindex = 0; else firstpindex = fs.first_pindex - 2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1); for(tmppindex = fs.first_pindex - 1; tmppindex >= firstpindex; --tmppindex) { vm_page_t mt; mt = vm_page_lookup( fs.first_object, tmppindex); if (mt == NULL || (mt->valid != VM_PAGE_BITS_ALL)) break; if (mt->busy || (mt->flags & (PG_BUSY | PG_FICTITIOUS)) || mt->hold_count || mt->wire_count) continue; if (mt->dirty == 0) vm_page_test_dirty(mt); if (mt->dirty) { vm_page_protect(mt, VM_PROT_NONE); vm_page_deactivate(mt); } else { vm_page_cache(mt); } } ahead += behind; behind = 0; } /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. */ unlock_map(&fs); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if(!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_free(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? 
KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_free(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) or the * pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; } fs.first_m = NULL; if ((fs.m->flags & PG_ZERO) == 0) { vm_page_zero_fill(fs.m); cnt.v_ozfod++; } cnt.v_zfod++; break; } else { if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); } fs.object = next_object; vm_object_pip_add(fs.object, 1); } } #if defined(DIAGNOSTIC) if ((fs.m->flags & PG_BUSY) == 0) panic("vm_fault: not busy after main loop"); #endif /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ old_m = fs.m; /* save page that would be copied */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if (fault_type & VM_PROT_WRITE) { /* * This allows pages to be virtually copied from a backing_object * into the first_object, where the backing object has no other * refs to it, and cannot gain any more refs. Instead of a * bcopy, we just move the page from the backing object to the * first object. Note that we must mark the page dirty in the * first object so that it will go out to swap when needed. */ if (map_generation == fs.map->timestamp && /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * Noone else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && /* * We don't chase down the shadow chain */ (fs.object == fs.first_object->backing_object) && /* * grab the lock if we need to */ (fs.lookup_still_valid || (((fs.entry->eflags & MAP_ENTRY_IS_A_MAP) == 0) && lockmgr(&fs.map->lock, LK_EXCLUSIVE|LK_NOWAIT, (void *)0, curproc) == 0))) { fs.lookup_still_valid = 1; /* * get rid of the unnecessary page */ vm_page_protect(fs.first_m, VM_PROT_NONE); vm_page_free(fs.first_m); fs.first_m = NULL; /* * grab the page and put it into the process'es object */ vm_page_rename(fs.m, fs.first_object, fs.first_pindex); fs.first_m = fs.m; fs.first_m->dirty = VM_PAGE_BITS_ALL; vm_page_busy(fs.first_m); fs.m = NULL; cnt.v_cow_optim++; } else { /* * Oh, well, lets copy it. */ vm_page_copy(fs.m, fs.first_m); } if (fs.m) { /* * We no longer need the old page or object. */ release_page(&fs); } vm_object_pip_wakeup(fs.object); /* * Only use the new page below... */ cnt.v_cow_faults++; fs.m = fs.first_m; fs.object = fs.first_object; fs.pindex = fs.first_pindex; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. 
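 *
 * Concretely: if the lookup was given up along the way and
 * fs.map->timestamp no longer matches the map_generation we sampled,
 * redo vm_map_lookup() (deliberately without asking for write
 * permission); a failed relookup just returns the error, and otherwise
 *
 *	retry_object != fs.first_object ||
 *	retry_pindex != fs.first_pindex		-> goto RetryFault;
 *	else					-> prot &= retry_prot;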
*/ if (!fs.lookup_still_valid && (fs.map->timestamp != map_generation)) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; /* * Since map entries may be pageable, make sure we can take a * page fault on them. */ /* * To avoid trying to write_lock the map while another process * has it read_locked (in vm_map_pageable), we do not try for * write permission. If the page is still writable, we will * get write permission. If it is not, or has been marked * needs_copy, we enter the mapping without write permission, * and will merely take another fault. */ result = vm_map_lookup(&fs.map, vaddr, fault_type & ~VM_PROT_WRITE, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); map_generation = fs.map->timestamp; /* * If we don't need the page any longer, put it on the active * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); return (result); } fs.lookup_still_valid = TRUE; if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter may cause other faults. We don't put the page * back on the active queue until later so that the page-out daemon * won't find us (yet). */ if (prot & VM_PROT_WRITE) { vm_page_flag_set(fs.m, PG_WRITEABLE); vm_object_set_flag(fs.m->object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); /* * If the fault is a write, we know that this page is being * written NOW. This will save on the pmap_is_modified() calls * later. */ if (fault_flags & VM_FAULT_DIRTY) { fs.m->dirty = VM_PAGE_BITS_ALL; } } unlock_things(&fs); fs.m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(fs.m, PG_ZERO); pmap_enter(fs.map->pmap, vaddr, VM_PAGE_TO_PHYS(fs.m), prot, wired); if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) { pmap_prefault(fs.map->pmap, vaddr, fs.entry); } vm_page_flag_set(fs.m, PG_MAPPED|PG_REFERENCED); if (fault_flags & VM_FAULT_HOLD) vm_page_hold(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_WIRE_MASK) { if (wired) vm_page_wire(fs.m); else - vm_page_unwire(fs.m); + vm_page_unwire(fs.m, 1); } else { vm_page_activate(fs.m); } if (curproc && (curproc->p_flag & P_INMEM) && curproc->p_stats) { if (hardfault) { curproc->p_stats->p_ru.ru_majflt++; } else { curproc->p_stats->p_ru.ru_minflt++; } } /* * Unlock everything, and return */ vm_page_wakeup(fs.m); vm_object_deallocate(fs.first_object); return (KERN_SUCCESS); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. */ int vm_fault_wire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va; register pmap_t pmap; int rv; pmap = vm_map_pmap(map); /* * Inform the physical mapping system that the range of addresses may * not fault, so that page tables and such can be locked down as well. */ pmap_pageable(pmap, start, end, FALSE); /* * We simulate a fault to get the page and enter it in the physical * map. 
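 *
 * Note the calling convention used on the unwire side throughout this
 * revision: vm_page_unwire() now takes a second argument (the vm_page.c
 * part of the change, further down, carries the implementation).  The
 * call sites changed here pass 1 on the fault, procfs and unwire paths,
 * where the page is likely to be used again soon, and 0 on the buffer
 * cache paths, where the cache is finished with the page:
 *
 *	vm_page_unwire(fs.m, 1);		(vm_fault)
 *	vm_page_unwire(PHYS_TO_VM_PAGE(pa), 1);	(vm_fault_unwire)
 *	vm_page_unwire(m, 0);			(allocbuf, vm_hold_free_pages)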
*/ for (va = start; va < end; va += PAGE_SIZE) { rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE, VM_FAULT_CHANGE_WIRING); if (rv) { if (va != start) vm_fault_unwire(map, start, va); return (rv); } } return (KERN_SUCCESS); } /* * vm_fault_user_wire: * * Wire down a range of virtual addresses in a map. This * is for user mode though, so we only ask for read access * on currently read only sections. */ int vm_fault_user_wire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va; register pmap_t pmap; int rv; pmap = vm_map_pmap(map); /* * Inform the physical mapping system that the range of addresses may * not fault, so that page tables and such can be locked down as well. */ pmap_pageable(pmap, start, end, FALSE); /* * We simulate a fault to get the page and enter it in the physical * map. */ for (va = start; va < end; va += PAGE_SIZE) { rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_USER_WIRE); if (rv) { if (va != start) vm_fault_unwire(map, start, va); return (rv); } } return (KERN_SUCCESS); } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. */ void vm_fault_unwire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va, pa; register pmap_t pmap; pmap = vm_map_pmap(map); /* * Since the pages are wired down, we must be able to get their * mappings from the physical map system. */ for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa != (vm_offset_t) 0) { pmap_change_wiring(pmap, va, FALSE); - vm_page_unwire(PHYS_TO_VM_PAGE(pa)); + vm_page_unwire(PHYS_TO_VM_PAGE(pa), 1); } } /* * Inform the physical mapping system that the range of addresses may * fault, so that page tables and such may be unwired themselves. */ pmap_pageable(pmap, start, end, TRUE); } /* * Routine: * vm_fault_copy_entry * Function: * Copy all of the pages from a wired-down map entry to another. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) vm_map_t dst_map; vm_map_t src_map; vm_map_entry_t dst_entry; vm_map_entry_t src_entry; { vm_object_t dst_object; vm_object_t src_object; vm_ooffset_t dst_offset; vm_ooffset_t src_offset; vm_prot_t prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; #ifdef lint src_map++; #endif /* lint */ src_object = src_entry->object.vm_object; src_offset = src_entry->offset; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, (vm_size_t) OFF_TO_IDX(dst_entry->end - dst_entry->start)); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; prot = dst_entry->max_protection; /* * Loop through all of the pages in the entry's range, copying each * one from the source object (it should be there) to the destination * object. */ for (vaddr = dst_entry->start, dst_offset = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { /* * Allocate a page in the destination object */ do { dst_m = vm_page_alloc(dst_object, OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_WAIT; } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) 
*/ src_m = vm_page_lookup(src_object, OFF_TO_IDX(dst_offset + src_offset)); if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); vm_page_copy(src_m, dst_m); /* * Enter it in the pmap... */ vm_page_flag_clear(dst_m, PG_ZERO); pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m), prot, FALSE); vm_page_flag_set(dst_m, PG_WRITEABLE|PG_MAPPED); /* * Mark it no longer busy, and put it on the active list. */ vm_page_activate(dst_m); vm_page_wakeup(dst_m); } } /* * This routine checks around the requested page for other pages that * might be able to be faulted in. This routine brackets the viable * pages for the pages to be paged in. * * Inputs: * m, rbehind, rahead * * Outputs: * marray (array of vm_page_t), reqpage (index of requested page) * * Return value: * number of pages in marray */ static int vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) vm_page_t m; int rbehind; int rahead; vm_page_t *marray; int *reqpage; { int i,j; vm_object_t object; vm_pindex_t pindex, startpindex, endpindex, tpindex; vm_page_t rtm; int cbehind, cahead; object = m->object; pindex = m->pindex; /* * we don't fault-ahead for device pager */ if (object->type == OBJT_DEVICE) { *reqpage = 0; marray[0] = m; return 1; } /* * if the requested page is not available, then give up now */ if (!vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + pindex, &cbehind, &cahead)) { return 0; } if ((cbehind == 0) && (cahead == 0)) { *reqpage = 0; marray[0] = m; return 1; } if (rahead > cahead) { rahead = cahead; } if (rbehind > cbehind) { rbehind = cbehind; } /* * try to do any readahead that we might have free pages for. */ if ((rahead + rbehind) > ((cnt.v_free_count + cnt.v_cache_count) - cnt.v_free_reserved)) { pagedaemon_wakeup(); marray[0] = m; *reqpage = 0; return 1; } /* * scan backward for the read behind pages -- in memory */ if (pindex > 0) { if (rbehind > pindex) { rbehind = pindex; startpindex = 0; } else { startpindex = pindex - rbehind; } for ( tpindex = pindex - 1; tpindex >= startpindex; tpindex -= 1) { if (vm_page_lookup( object, tpindex)) { startpindex = tpindex + 1; break; } if (tpindex == 0) break; } for(i = 0, tpindex = startpindex; tpindex < pindex; i++, tpindex++) { rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (rtm == NULL) { for (j = 0; j < i; j++) { vm_page_free(marray[j]); } marray[0] = m; *reqpage = 0; return 1; } marray[i] = rtm; } } else { startpindex = 0; i = 0; } marray[i] = m; /* page offset of the required page */ *reqpage = i; tpindex = pindex + 1; i++; /* * scan forward for the read ahead pages */ endpindex = tpindex + rahead; if (endpindex > object->size) endpindex = object->size; for( ; tpindex < endpindex; i++, tpindex++) { if (vm_page_lookup(object, tpindex)) { break; } rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (rtm == NULL) { break; } marray[i] = rtm; } /* return number of bytes of pages */ return i; } Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 40699) +++ head/sys/vm/vm_page.c (revision 40700) @@ -1,1746 +1,1754 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 - * $Id: vm_page.c,v 1.109 1998/10/21 14:46:41 dg Exp $ + * $Id: vm_page.c,v 1.110 1998/10/25 17:44:59 phk Exp $ */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vm_page_queue_init __P((void)); static vm_page_t vm_page_select_free __P((vm_object_t object, vm_pindex_t pindex, int prefqueue)); static vm_page_t vm_page_select_cache __P((vm_object_t, vm_pindex_t)); /* * Associated with page of user-allocatable memory is a * page structure. */ static struct pglist *vm_page_buckets; /* Array of buckets */ static int vm_page_bucket_count; /* How big is array? 
*/ static int vm_page_hash_mask; /* Mask for hash function */ static volatile int vm_page_bucket_generation; struct pglist vm_page_queue_free[PQ_L2_SIZE] = {0}; struct pglist vm_page_queue_zero[PQ_L2_SIZE] = {0}; struct pglist vm_page_queue_active = {0}; struct pglist vm_page_queue_inactive = {0}; struct pglist vm_page_queue_cache[PQ_L2_SIZE] = {0}; static int no_queue=0; struct vpgqueues vm_page_queues[PQ_COUNT] = {0}; static int pqcnt[PQ_COUNT] = {0}; static void vm_page_queue_init(void) { int i; vm_page_queues[PQ_NONE].pl = NULL; vm_page_queues[PQ_NONE].cnt = &no_queue; for(i=0;i biggestsize) { biggestone = i; biggestsize = size; } ++nblocks; total += size; } start = phys_avail[biggestone]; /* * Initialize the queue headers for the free queue, the active queue * and the inactive queue. */ vm_page_queue_init(); /* * Allocate (and initialize) the hash table buckets. * * The number of buckets MUST BE a power of 2, and the actual value is * the next power of 2 greater than the number of physical pages in * the system. * * Note: This computation can be tweaked if desired. */ vm_page_buckets = (struct pglist *) vaddr; bucket = vm_page_buckets; if (vm_page_bucket_count == 0) { vm_page_bucket_count = 1; while (vm_page_bucket_count < atop(total)) vm_page_bucket_count <<= 1; } vm_page_hash_mask = vm_page_bucket_count - 1; /* * Validate these addresses. */ new_start = start + vm_page_bucket_count * sizeof(struct pglist); new_start = round_page(new_start); mapped = round_page(vaddr); vaddr = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; vaddr = round_page(vaddr); bzero((caddr_t) mapped, vaddr - mapped); for (i = 0; i < vm_page_bucket_count; i++) { TAILQ_INIT(bucket); bucket++; } /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = phys_avail[0] / PAGE_SIZE; last_page = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE; page_range = last_page - (phys_avail[0] / PAGE_SIZE); npages = (total - (page_range * sizeof(struct vm_page)) - (start - phys_avail[biggestone])) / PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ vm_page_array = (vm_page_t) vaddr; mapped = vaddr; /* * Validate these addresses. */ new_start = round_page(start + page_range * sizeof(struct vm_page)); mapped = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; first_managed_page = start / PAGE_SIZE; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); vm_page_array_size = page_range; cnt.v_page_count = 0; cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { if (i == biggestone) pa = ptoa(first_managed_page); else pa = phys_avail[i]; while (pa < phys_avail[i + 1] && npages-- > 0) { ++cnt.v_page_count; ++cnt.v_free_count; m = PHYS_TO_VM_PAGE(pa); m->phys_addr = pa; m->flags = 0; m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK; m->queue = m->pc + PQ_FREE; TAILQ_INSERT_TAIL(vm_page_queues[m->queue].pl, m, pageq); ++(*vm_page_queues[m->queue].lcnt); pa += PAGE_SIZE; } } return (mapped); } /* * vm_page_hash: * * Distributes the object/offset key pair among hash buckets. * * NOTE: This macro depends on vm_page_bucket_count being a power of 2. 
*/ static __inline int vm_page_hash(object, pindex) vm_object_t object; vm_pindex_t pindex; { return ((((uintptr_t) object) >> 5) + (pindex >> 1)) & vm_page_hash_mask; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object/object-page * table and object list. * * The object and page must be locked, and must be splhigh. */ void vm_page_insert(m, object, pindex) register vm_page_t m; register vm_object_t object; register vm_pindex_t pindex; { register struct pglist *bucket; if (m->object != NULL) panic("vm_page_insert: already inserted"); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Insert it into the object_object/offset hash table */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; TAILQ_INSERT_TAIL(bucket, m, hashq); vm_page_bucket_generation++; /* * Now link into the object's list of backed pages. */ TAILQ_INSERT_TAIL(&object->memq, m, listq); m->object->page_hint = m; m->object->generation++; if (m->wire_count) object->wire_count++; if ((m->queue - m->pc) == PQ_CACHE) object->cache_count++; /* * And show that the object has one more resident page. */ object->resident_page_count++; } /* * vm_page_remove: [ internal use only ] * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page * table and the object page list. * * The object and page must be locked, and at splhigh. */ void vm_page_remove(m) register vm_page_t m; { register struct pglist *bucket; vm_object_t object; if (m->object == NULL) return; #if !defined(MAX_PERF) if ((m->flags & PG_BUSY) == 0) { panic("vm_page_remove: page not busy"); } #endif vm_page_flag_clear(m, PG_BUSY); if (m->flags & PG_WANTED) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } object = m->object; if (object->page_hint == m) object->page_hint = NULL; if (m->wire_count) object->wire_count--; if ((m->queue - m->pc) == PQ_CACHE) object->cache_count--; /* * Remove from the object_object/offset hash table */ bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)]; TAILQ_REMOVE(bucket, m, hashq); vm_page_bucket_generation++; /* * Now remove from the object's list of backed pages. */ TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; object->generation++; m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. No side effects. */ vm_page_t vm_page_lookup(object, pindex) register vm_object_t object; register vm_pindex_t pindex; { register vm_page_t m; register struct pglist *bucket; int generation; /* * Search the hash table for this object/offset pair */ if (object->page_hint && (object->page_hint->pindex == pindex) && (object->page_hint->object == object)) return object->page_hint; retry: generation = vm_page_bucket_generation; bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; for (m = TAILQ_FIRST(bucket); m != NULL; m = TAILQ_NEXT(m,hashq)) { if ((m->object == object) && (m->pindex == pindex)) { if (vm_page_bucket_generation != generation) goto retry; m->object->page_hint = m; return (m); } } if (vm_page_bucket_generation != generation) goto retry; return (NULL); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * The object must be locked. 
*/ void vm_page_rename(m, new_object, new_pindex) register vm_page_t m; register vm_object_t new_object; vm_pindex_t new_pindex; { int s; s = splvm(); vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); splx(s); } /* * vm_page_unqueue without any wakeup */ void vm_page_unqueue_nowakeup(m) vm_page_t m; { int queue = m->queue; struct vpgqueues *pq; if (queue != PQ_NONE) { pq = &vm_page_queues[queue]; m->queue = PQ_NONE; TAILQ_REMOVE(pq->pl, m, pageq); (*pq->cnt)--; (*pq->lcnt)--; if ((queue - m->pc) == PQ_CACHE) { if (m->object) m->object->cache_count--; } } } /* * vm_page_unqueue must be called at splhigh(); */ void vm_page_unqueue(m) vm_page_t m; { int queue = m->queue; struct vpgqueues *pq; if (queue != PQ_NONE) { m->queue = PQ_NONE; pq = &vm_page_queues[queue]; TAILQ_REMOVE(pq->pl, m, pageq); (*pq->cnt)--; (*pq->lcnt)--; if ((queue - m->pc) == PQ_CACHE) { if ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_free_reserved + cnt.v_cache_min)) pagedaemon_wakeup(); if (m->object) m->object->cache_count--; } } } /* * Find a page on the specified queue with color optimization. */ vm_page_t vm_page_list_find(basequeue, index) int basequeue, index; { #if PQ_L2_SIZE > 1 int i,j; vm_page_t m; int hindex; struct vpgqueues *pq; pq = &vm_page_queues[basequeue]; m = TAILQ_FIRST(pq[index].pl); if (m) return m; for(j = 0; j < PQ_L1_SIZE; j++) { int ij; for(i = (PQ_L2_SIZE / 2) - PQ_L1_SIZE; (ij = i + j) > 0; i -= PQ_L1_SIZE) { hindex = index + ij; if (hindex >= PQ_L2_SIZE) hindex -= PQ_L2_SIZE; if (m = TAILQ_FIRST(pq[hindex].pl)) return m; hindex = index - ij; if (hindex < 0) hindex += PQ_L2_SIZE; if (m = TAILQ_FIRST(pq[hindex].pl)) return m; } } hindex = index + PQ_L2_SIZE / 2; if (hindex >= PQ_L2_SIZE) hindex -= PQ_L2_SIZE; m = TAILQ_FIRST(pq[hindex].pl); if (m) return m; return NULL; #else return TAILQ_FIRST(vm_page_queues[basequeue].pl); #endif } /* * Find a page on the specified queue with color optimization. */ vm_page_t vm_page_select(object, pindex, basequeue) vm_object_t object; vm_pindex_t pindex; int basequeue; { #if PQ_L2_SIZE > 1 int index; index = (pindex + object->pg_color) & PQ_L2_MASK; return vm_page_list_find(basequeue, index); #else return TAILQ_FIRST(vm_page_queues[basequeue].pl); #endif } /* * Find a page on the cache queue with color optimization. As pages * might be found, but not applicable, they are deactivated. This * keeps us from using potentially busy cached pages. */ vm_page_t vm_page_select_cache(object, pindex) vm_object_t object; vm_pindex_t pindex; { vm_page_t m; while (TRUE) { #if PQ_L2_SIZE > 1 int index; index = (pindex + object->pg_color) & PQ_L2_MASK; m = vm_page_list_find(PQ_CACHE, index); #else m = TAILQ_FIRST(vm_page_queues[PQ_CACHE].pl); #endif if (m && ((m->flags & PG_BUSY) || m->busy || m->hold_count || m->wire_count)) { vm_page_deactivate(m); continue; } return m; } } /* * Find a free or zero page, with specified preference. 
*/ static vm_page_t vm_page_select_free(object, pindex, prefqueue) vm_object_t object; vm_pindex_t pindex; int prefqueue; { #if PQ_L2_SIZE > 1 int i,j; int index, hindex; #endif vm_page_t m, mh; int oqueuediff; struct vpgqueues *pq; if (prefqueue == PQ_ZERO) oqueuediff = PQ_FREE - PQ_ZERO; else oqueuediff = PQ_ZERO - PQ_FREE; if (mh = object->page_hint) { if (mh->pindex == (pindex - 1)) { if ((mh->flags & PG_FICTITIOUS) == 0) { if ((mh < &vm_page_array[cnt.v_page_count-1]) && (mh >= &vm_page_array[0])) { int queue; m = mh + 1; if (VM_PAGE_TO_PHYS(m) == (VM_PAGE_TO_PHYS(mh) + PAGE_SIZE)) { queue = m->queue - m->pc; if (queue == PQ_FREE || queue == PQ_ZERO) { return m; } } } } } } pq = &vm_page_queues[prefqueue]; #if PQ_L2_SIZE > 1 index = (pindex + object->pg_color) & PQ_L2_MASK; if (m = TAILQ_FIRST(pq[index].pl)) return m; if (m = TAILQ_FIRST(pq[index + oqueuediff].pl)) return m; for(j = 0; j < PQ_L1_SIZE; j++) { int ij; for(i = (PQ_L2_SIZE / 2) - PQ_L1_SIZE; (ij = i + j) >= 0; i -= PQ_L1_SIZE) { hindex = index + ij; if (hindex >= PQ_L2_SIZE) hindex -= PQ_L2_SIZE; if (m = TAILQ_FIRST(pq[hindex].pl)) return m; if (m = TAILQ_FIRST(pq[hindex + oqueuediff].pl)) return m; hindex = index - ij; if (hindex < 0) hindex += PQ_L2_SIZE; if (m = TAILQ_FIRST(pq[hindex].pl)) return m; if (m = TAILQ_FIRST(pq[hindex + oqueuediff].pl)) return m; } } hindex = index + PQ_L2_SIZE / 2; if (hindex >= PQ_L2_SIZE) hindex -= PQ_L2_SIZE; if (m = TAILQ_FIRST(pq[hindex].pl)) return m; if (m = TAILQ_FIRST(pq[hindex+oqueuediff].pl)) return m; #else if (m = TAILQ_FIRST(pq[0].pl)) return m; else return TAILQ_FIRST(pq[oqueuediff].pl); #endif return NULL; } /* * vm_page_alloc: * * Allocate and return a memory cell associated * with this VM object/offset pair. * * page_req classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * VM_ALLOC_ZERO zero page * * Object must be locked. 
*/ vm_page_t vm_page_alloc(object, pindex, page_req) vm_object_t object; vm_pindex_t pindex; int page_req; { register vm_page_t m; struct vpgqueues *pq; vm_object_t oldobject; int queue, qtype; int s; #ifdef DIAGNOSTIC m = vm_page_lookup(object, pindex); if (m) panic("vm_page_alloc: page already allocated"); #endif if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; s = splvm(); switch (page_req) { case VM_ALLOC_NORMAL: if (cnt.v_free_count >= cnt.v_free_reserved) { m = vm_page_select_free(object, pindex, PQ_FREE); #if defined(DIAGNOSTIC) if (m == NULL) panic("vm_page_alloc(NORMAL): missing page on free queue\n"); #endif } else { m = vm_page_select_cache(object, pindex); if (m == NULL) { splx(s); #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif vm_pageout_deficit++; pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_ZERO: if (cnt.v_free_count >= cnt.v_free_reserved) { m = vm_page_select_free(object, pindex, PQ_ZERO); #if defined(DIAGNOSTIC) if (m == NULL) panic("vm_page_alloc(ZERO): missing page on free queue\n"); #endif } else { m = vm_page_select_cache(object, pindex); if (m == NULL) { splx(s); #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(ZERO): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif vm_pageout_deficit++; pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_SYSTEM: if ((cnt.v_free_count >= cnt.v_free_reserved) || ((cnt.v_cache_count == 0) && (cnt.v_free_count >= cnt.v_interrupt_free_min))) { m = vm_page_select_free(object, pindex, PQ_FREE); #if defined(DIAGNOSTIC) if (m == NULL) panic("vm_page_alloc(SYSTEM): missing page on free queue\n"); #endif } else { m = vm_page_select_cache(object, pindex); if (m == NULL) { splx(s); #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(SYSTEM): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif vm_pageout_deficit++; pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_INTERRUPT: if (cnt.v_free_count > 0) { m = vm_page_select_free(object, pindex, PQ_FREE); #if defined(DIAGNOSTIC) if (m == NULL) panic("vm_page_alloc(INTERRUPT): missing page on free queue\n"); #endif } else { splx(s); vm_pageout_deficit++; pagedaemon_wakeup(); return (NULL); } break; default: m = NULL; #if !defined(MAX_PERF) panic("vm_page_alloc: invalid allocation class"); #endif } queue = m->queue; qtype = queue - m->pc; if (qtype == PQ_ZERO) vm_page_zero_count--; pq = &vm_page_queues[queue]; TAILQ_REMOVE(pq->pl, m, pageq); (*pq->cnt)--; (*pq->lcnt)--; oldobject = NULL; if (qtype == PQ_ZERO) { m->flags = PG_ZERO | PG_BUSY; } else if (qtype == PQ_CACHE) { oldobject = m->object; vm_page_busy(m); vm_page_remove(m); m->flags = PG_BUSY; } else { m->flags = PG_BUSY; } m->wire_count = 0; m->hold_count = 0; m->act_count = 0; m->busy = 0; m->valid = 0; m->dirty = 0; m->queue = PQ_NONE; /* XXX before splx until vm_page_insert is safe */ vm_page_insert(m, object, pindex); /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. 
*/ if (((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_reserved + cnt.v_cache_min)) || (cnt.v_free_count < cnt.v_pageout_free_min)) pagedaemon_wakeup(); if ((qtype == PQ_CACHE) && ((page_req == VM_ALLOC_NORMAL) || (page_req == VM_ALLOC_ZERO)) && oldobject && (oldobject->type == OBJT_VNODE) && ((oldobject->flags & OBJ_DEAD) == 0)) { struct vnode *vp; vp = (struct vnode *) oldobject->handle; if (vp && VSHOULDFREE(vp)) { if ((vp->v_flag & (VFREE|VTBFREE|VDOOMED)) == 0) { TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist); vp->v_flag |= VTBFREE; } } } splx(s); return (m); } void vm_wait() { int s; s = splvm(); if (curproc == pageproc) { vm_pageout_pages_needed = 1; tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0); } else { if (!vm_pages_needed) { vm_pages_needed++; wakeup(&vm_pages_needed); } tsleep(&cnt.v_free_count, PVM, "vmwait", 0); } splx(s); } int vm_page_sleep(vm_page_t m, char *msg, char *busy) { int slept = 0; if ((busy && *busy) || (m->flags & PG_BUSY)) { int s; s = splvm(); if ((busy && *busy) || (m->flags & PG_BUSY)) { vm_page_flag_set(m, PG_WANTED); tsleep(m, PVM, msg, 0); slept = 1; } splx(s); } return slept; } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * * The page queues must be locked. */ void vm_page_activate(m) register vm_page_t m; { int s; s = splvm(); if (m->queue != PQ_ACTIVE) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; vm_page_unqueue(m); if (m->wire_count == 0) { m->queue = PQ_ACTIVE; ++(*vm_page_queues[PQ_ACTIVE].lcnt); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; cnt.v_active_count++; } } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } splx(s); } /* * helper routine for vm_page_free and vm_page_free_zero */ static int vm_page_freechk_and_unqueue(m) vm_page_t m; { vm_object_t oldobject; oldobject = m->object; #if !defined(MAX_PERF) if (m->busy || ((m->queue - m->pc) == PQ_FREE) || (m->hold_count != 0)) { printf( "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n", (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0, m->hold_count); if ((m->queue - m->pc) == PQ_FREE) panic("vm_page_free: freeing free page"); else panic("vm_page_free: freeing busy page"); } #endif vm_page_unqueue_nowakeup(m); vm_page_remove(m); if ((m->flags & PG_FICTITIOUS) != 0) { return 0; } m->valid = 0; if (m->wire_count != 0) { #if !defined(MAX_PERF) if (m->wire_count > 1) { panic("vm_page_free: invalid wire count (%d), pindex: 0x%x", m->wire_count, m->pindex); } #endif printf("vm_page_free: freeing wired page\n"); m->wire_count = 0; if (m->object) m->object->wire_count--; cnt.v_wire_count--; } if (oldobject && (oldobject->type == OBJT_VNODE) && ((oldobject->flags & OBJ_DEAD) == 0)) { struct vnode *vp; vp = (struct vnode *) oldobject->handle; if (vp && VSHOULDFREE(vp)) { if ((vp->v_flag & (VTBFREE|VDOOMED|VFREE)) == 0) { TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist); vp->v_flag |= VTBFREE; } } } #ifdef __alpha__ pmap_page_is_free(m); #endif return 1; } /* * helper routine for vm_page_free and vm_page_free_zero */ static __inline void vm_page_free_wakeup() { /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. 
*/ if (vm_pages_needed && ((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) { wakeup(&cnt.v_free_count); vm_pages_needed = 0; } } /* * vm_page_free: * * Returns the given page to the free list, * disassociating it with any VM object. * * Object and page must be locked prior to entry. */ void vm_page_free(m) register vm_page_t m; { int s; struct vpgqueues *pq; s = splvm(); cnt.v_tfree++; if (!vm_page_freechk_and_unqueue(m)) { splx(s); return; } m->queue = PQ_FREE + m->pc; pq = &vm_page_queues[m->queue]; ++(*pq->lcnt); ++(*pq->cnt); /* * If the pageout process is grabbing the page, it is likely * that the page is NOT in the cache. It is more likely that * the page will be partially in the cache if it is being * explicitly freed. */ if (curproc == pageproc) { TAILQ_INSERT_TAIL(pq->pl, m, pageq); } else { TAILQ_INSERT_HEAD(pq->pl, m, pageq); } vm_page_free_wakeup(); splx(s); } void vm_page_free_zero(m) register vm_page_t m; { int s; struct vpgqueues *pq; s = splvm(); cnt.v_tfree++; if (!vm_page_freechk_and_unqueue(m)) { splx(s); return; } m->queue = PQ_ZERO + m->pc; pq = &vm_page_queues[m->queue]; ++(*pq->lcnt); ++(*pq->cnt); TAILQ_INSERT_HEAD(pq->pl, m, pageq); ++vm_page_zero_count; vm_page_free_wakeup(); splx(s); } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * The page queues must be locked. */ void vm_page_wire(m) register vm_page_t m; { int s; if (m->wire_count == 0) { s = splvm(); vm_page_unqueue(m); splx(s); cnt.v_wire_count++; if (m->object) m->object->wire_count++; } (*vm_page_queues[PQ_NONE].lcnt)++; m->wire_count++; vm_page_flag_set(m, PG_MAPPED); } /* * vm_page_unwire: * * Release one wiring of this page, potentially * enabling it to be paged again. * * The page queues must be locked. */ void -vm_page_unwire(m) +vm_page_unwire(m, activate) register vm_page_t m; + int activate; { int s; s = splvm(); if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { if (m->object) m->object->wire_count--; cnt.v_wire_count--; - TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); - m->queue = PQ_ACTIVE; - (*vm_page_queues[PQ_ACTIVE].lcnt)++; - cnt.v_active_count++; + if (activate) { + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + m->queue = PQ_ACTIVE; + (*vm_page_queues[PQ_ACTIVE].lcnt)++; + cnt.v_active_count++; + } else { + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + m->queue = PQ_INACTIVE; + (*vm_page_queues[PQ_INACTIVE].lcnt)++; + cnt.v_inactive_count++; + } } } else { #if !defined(MAX_PERF) panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count); #endif } splx(s); } /* * vm_page_deactivate: * * Returns the given page to the inactive list, * indicating that no physical maps have access * to this page. [Used by the physical mapping system.] * * The page queues must be locked. */ void vm_page_deactivate(m) register vm_page_t m; { int s; /* * Only move active pages -- ignore locked or already inactive ones. * * XXX: sometimes we get pages which aren't wired down or on any queue - * we need to put them on the inactive queue also, otherwise we lose * track of them. Paul Mackerras (paulus@cs.anu.edu.au) 9-Jan-93. 
*/ if (m->queue == PQ_INACTIVE) return; s = splvm(); if (m->wire_count == 0) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; vm_page_unqueue(m); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); m->queue = PQ_INACTIVE; ++(*vm_page_queues[PQ_INACTIVE].lcnt); cnt.v_inactive_count++; } splx(s); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). */ void vm_page_cache(m) register vm_page_t m; { int s; #if !defined(MAX_PERF) if ((m->flags & PG_BUSY) || m->busy || m->wire_count) { printf("vm_page_cache: attempting to cache busy page\n"); return; } #endif if ((m->queue - m->pc) == PQ_CACHE) return; vm_page_protect(m, VM_PROT_NONE); #if !defined(MAX_PERF) if (m->dirty != 0) { panic("vm_page_cache: caching a dirty page, pindex: %d", m->pindex); } #endif s = splvm(); vm_page_unqueue_nowakeup(m); m->queue = PQ_CACHE + m->pc; (*vm_page_queues[m->queue].lcnt)++; TAILQ_INSERT_TAIL(vm_page_queues[m->queue].pl, m, pageq); cnt.v_cache_count++; m->object->cache_count++; vm_page_free_wakeup(); splx(s); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, allocate it. */ vm_page_t vm_page_grab(object, pindex, allocflags) vm_object_t object; vm_pindex_t pindex; int allocflags; { vm_page_t m; int s, generation; retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { if (m->busy || (m->flags & PG_BUSY)) { generation = object->generation; s = splvm(); while ((object->generation == generation) && (m->busy || (m->flags & PG_BUSY))) { vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); tsleep(m, PVM, "pgrbwt", 0); if ((allocflags & VM_ALLOC_RETRY) == 0) { splx(s); return NULL; } } splx(s); goto retrylookup; } else { vm_page_busy(m); return m; } } m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY); if (m == NULL) { VM_WAIT; if ((allocflags & VM_ALLOC_RETRY) == 0) return NULL; goto retrylookup; } return m; } /* * mapping function for valid bits or for dirty bits in * a page */ __inline int vm_page_bits(int base, int size) { u_short chunk; if ((base == 0) && (size >= PAGE_SIZE)) return VM_PAGE_BITS_ALL; size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); base &= PAGE_MASK; if (size > PAGE_SIZE - base) { size = PAGE_SIZE - base; } base = base / DEV_BSIZE; chunk = vm_page_dev_bsize_chunks[size / DEV_BSIZE]; return (chunk << base) & VM_PAGE_BITS_ALL; } /* * set a page valid and clean */ void vm_page_set_validclean(m, base, size) vm_page_t m; int base; int size; { int pagebits = vm_page_bits(base, size); m->valid |= pagebits; m->dirty &= ~pagebits; if( base == 0 && size == PAGE_SIZE) pmap_clear_modify(VM_PAGE_TO_PHYS(m)); } /* * set a page (partially) invalid */ void vm_page_set_invalid(m, base, size) vm_page_t m; int base; int size; { int bits; m->valid &= ~(bits = vm_page_bits(base, size)); if (m->valid == 0) m->dirty &= ~bits; m->object->generation++; } /* * is (partial) page valid? */ int vm_page_is_valid(m, base, size) vm_page_t m; int base; int size; { int bits = vm_page_bits(base, size); if (m->valid && ((m->valid & bits) == bits)) return 1; else return 0; } void vm_page_test_dirty(m) vm_page_t m; { if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(VM_PAGE_TO_PHYS(m))) { m->dirty = VM_PAGE_BITS_ALL; } } /* * This interface is for merging with malloc() someday. 
* Even if we never implement compaction so that contiguous allocation * works after initialization time, malloc()'s data structures are good * for statistics and for allocations of less than a page. */ void * contigmalloc1(size, type, flags, low, high, alignment, boundary, map) unsigned long size; /* should be size_t here and for malloc() */ struct malloc_type *type; int flags; unsigned long low; unsigned long high; unsigned long alignment; unsigned long boundary; vm_map_t map; { int i, s, start; vm_offset_t addr, phys, tmp_addr; int pass; vm_page_t pga = vm_page_array; size = round_page(size); #if !defined(MAX_PERF) if (size == 0) panic("contigmalloc1: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("contigmalloc1: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("contigmalloc1: boundary must be a power of 2"); #endif start = 0; for (pass = 0; pass <= 1; pass++) { s = splvm(); again: /* * Find first page in array that is free, within range, aligned, and * such that the boundary won't be crossed. */ for (i = start; i < cnt.v_page_count; i++) { int pqtype; phys = VM_PAGE_TO_PHYS(&pga[i]); pqtype = pga[i].queue - pga[i].pc; if (((pqtype == PQ_ZERO) || (pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)) break; } /* * If the above failed or we will exceed the upper bound, fail. */ if ((i == cnt.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { vm_page_t m, next; again1: for (m = TAILQ_FIRST(&vm_page_queue_inactive); m != NULL; m = next) { if (m->queue != PQ_INACTIVE) { break; } next = TAILQ_NEXT(m, pageq); if (vm_page_sleep(m, "vpctw0", &m->busy)) goto again1; vm_page_test_dirty(m); if (m->dirty) { if (m->object->type == OBJT_VNODE) { vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc); vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC); VOP_UNLOCK(m->object->handle, 0, curproc); goto again1; } else if (m->object->type == OBJT_SWAP || m->object->type == OBJT_DEFAULT) { vm_pageout_flush(&m, 1, 0); goto again1; } } if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0)) vm_page_cache(m); } for (m = TAILQ_FIRST(&vm_page_queue_active); m != NULL; m = next) { if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); if (vm_page_sleep(m, "vpctw1", &m->busy)) goto again1; vm_page_test_dirty(m); if (m->dirty) { if (m->object->type == OBJT_VNODE) { vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc); vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC); VOP_UNLOCK(m->object->handle, 0, curproc); goto again1; } else if (m->object->type == OBJT_SWAP || m->object->type == OBJT_DEFAULT) { vm_pageout_flush(&m, 1, 0); goto again1; } } if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0)) vm_page_cache(m); } splx(s); continue; } start = i; /* * Check successive pages for contiguous and free. 
*/ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { int pqtype; pqtype = pga[i].queue - pga[i].pc; if ((VM_PAGE_TO_PHYS(&pga[i]) != (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || ((pqtype != PQ_ZERO) && (pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) { start++; goto again; } } for (i = start; i < (start + size / PAGE_SIZE); i++) { int pqtype; vm_page_t m = &pga[i]; pqtype = m->queue - m->pc; if (pqtype == PQ_CACHE) { vm_page_busy(m); vm_page_free(m); } TAILQ_REMOVE(vm_page_queues[m->queue].pl, m, pageq); (*vm_page_queues[m->queue].lcnt)--; cnt.v_free_count--; m->valid = VM_PAGE_BITS_ALL; m->flags = 0; m->dirty = 0; m->wire_count = 0; m->busy = 0; m->queue = PQ_NONE; m->object = NULL; vm_page_wire(m); } /* * We've found a contiguous chunk that meets are requirements. * Allocate kernel VM, unfree and assign the physical pages to it and * return kernel VM pointer. */ tmp_addr = addr = kmem_alloc_pageable(map, size); if (addr == 0) { /* * XXX We almost never run out of kernel virtual * space, so we don't make the allocated memory * above available. */ splx(s); return (NULL); } for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; vm_page_insert(m, kernel_object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); pmap_kenter(tmp_addr, VM_PAGE_TO_PHYS(m)); tmp_addr += PAGE_SIZE; } splx(s); return ((void *)addr); } return NULL; } void * contigmalloc(size, type, flags, low, high, alignment, boundary) unsigned long size; /* should be size_t here and for malloc() */ struct malloc_type *type; int flags; unsigned long low; unsigned long high; unsigned long alignment; unsigned long boundary; { return contigmalloc1(size, type, flags, low, high, alignment, boundary, kernel_map); } vm_offset_t vm_page_alloc_contig(size, low, high, alignment) vm_offset_t size; vm_offset_t low; vm_offset_t high; vm_offset_t alignment; { return ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high, alignment, 0ul, kernel_map)); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int i; db_printf("PQ_FREE:"); for(i=0;i #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several lists: * * A hash table bucket used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * Fields in this structure are locked either by the lock on the * object that the page belongs to (O) or by the lock on the page * queues (P). 
*/ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */ TAILQ_ENTRY(vm_page) hashq; /* hash table links (O) */ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_offset_t phys_addr; /* physical address of page */ u_short queue; /* page queue index */ u_short flags, /* see below */ pc; /* page color */ u_short wire_count; /* wired down maps refs (P) */ short hold_count; /* page hold count */ u_char act_count; /* page usage count */ u_char busy; /* page busy count */ /* NOTE that these must support one bit per DEV_BSIZE in a page!!! */ /* so, on normal X86 kernels, they must be at least 8 bits wide */ #if PAGE_SIZE == 4096 u_char valid; /* map of valid DEV_BSIZE chunks */ u_char dirty; /* map of dirty DEV_BSIZE chunks */ #elif PAGE_SIZE == 8192 u_short valid; /* map of valid DEV_BSIZE chunks */ u_short dirty; /* map of dirty DEV_BSIZE chunks */ #endif }; /* * Page coloring parameters */ /* Each of PQ_FREE, PQ_ZERO and PQ_CACHE have PQ_HASH_SIZE entries */ /* Define one of the following */ #if defined(PQ_HUGECACHE) #define PQ_PRIME1 31 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 23 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME3 17 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 256 /* A number of colors opt for 1M cache */ #define PQ_L1_SIZE 4 /* Four page L1 cache */ #endif /* Define one of the following */ #if defined(PQ_LARGECACHE) #define PQ_PRIME1 31 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 23 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME3 17 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 128 /* A number of colors opt for 512K cache */ #define PQ_L1_SIZE 4 /* Four page L1 cache (for PII) */ #endif /* * Use 'options PQ_NOOPT' to disable page coloring */ #if defined(PQ_NOOPT) #define PQ_PRIME1 1 #define PQ_PRIME2 1 #define PQ_PRIME3 1 #define PQ_L2_SIZE 1 #define PQ_L1_SIZE 1 #endif #if defined(PQ_NORMALCACHE) #define PQ_PRIME1 5 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 3 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME3 11 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 16 /* A reasonable number of colors (opt for 64K cache) */ #define PQ_L1_SIZE 2 /* Two page L1 cache */ #endif #if defined(PQ_MEDIUMCACHE) || !defined(PQ_L2_SIZE) #define PQ_PRIME1 13 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME2 7 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_PRIME3 5 /* Prime number somewhat less than PQ_HASH_SIZE */ #define PQ_L2_SIZE 64 /* A number of colors opt for 256K cache */ #define PQ_L1_SIZE 2 /* Two page L1 cache */ #endif #define PQ_L2_MASK (PQ_L2_SIZE - 1) #define PQ_NONE 0 #define PQ_FREE 1 #define PQ_ZERO (1 + PQ_L2_SIZE) #define PQ_INACTIVE (1 + 2*PQ_L2_SIZE) #define PQ_ACTIVE (2 + 2*PQ_L2_SIZE) #define PQ_CACHE (3 + 2*PQ_L2_SIZE) #define PQ_COUNT (3 + 3*PQ_L2_SIZE) extern struct vpgqueues { struct pglist *pl; int *cnt; int *lcnt; } vm_page_queues[PQ_COUNT]; /* * These are the flags defined for vm_page. * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. 
*/ #define PG_BUSY 0x01 /* page is in transit (O) */ #define PG_WANTED 0x02 /* someone is waiting for page (O) */ #define PG_FICTITIOUS 0x08 /* physical page doesn't exist (O) */ #define PG_WRITEABLE 0x10 /* page is mapped writeable */ #define PG_MAPPED 0x20 /* page is mapped */ #define PG_ZERO 0x40 /* page is zeroed */ #define PG_REFERENCED 0x80 /* page has been referenced */ #define PG_CLEANCHK 0x100 /* page will be checked for cleaning */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_INIT 5 #define ACT_MAX 64 #define PFCLUSTER_BEHIND 3 #define PFCLUSTER_AHEAD 3 #ifdef KERNEL /* * Each pageable resident page falls into one of four lists: * * free * Available for allocation now. * * The following are all LRU sorted: * * cache * Almost available for allocation. Still in an * object, but clean and immediately freeable at * non-interrupt times. * * inactive * Low activity, candidates for reclamation. * This is the list of pages that should be * paged out next. * * active * Pages that are "active" i.e. they have been * recently referenced. * * zero * Pages that are really free and have been pre-zeroed * */ extern struct pglist vm_page_queue_free[PQ_L2_SIZE];/* memory free queue */ extern struct pglist vm_page_queue_zero[PQ_L2_SIZE];/* zeroed memory free queue */ extern struct pglist vm_page_queue_active; /* active memory queue */ extern struct pglist vm_page_queue_inactive; /* inactive memory queue */ extern struct pglist vm_page_queue_cache[PQ_L2_SIZE];/* cache memory queue */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern long first_page; /* first physical page number */ /* ... represented in vm_page_array */ extern long last_page; /* last physical page number */ /* ... 
represented in vm_page_array */ /* [INCLUSIVE] */ extern vm_offset_t first_phys_addr; /* physical address for first_page */ extern vm_offset_t last_phys_addr; /* physical address for last_page */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) #define IS_VM_PHYSADDR(pa) \ ((pa) >= first_phys_addr && (pa) <= last_phys_addr) #define PHYS_TO_VM_PAGE(pa) \ (&vm_page_array[atop(pa) - first_page ]) /* * Functions implemented as macros */ static __inline void vm_page_flag_set(vm_page_t m, unsigned int bits) { atomic_set_short(&(m)->flags, bits); } static __inline void vm_page_flag_clear(vm_page_t m, unsigned int bits) { atomic_clear_short(&(m)->flags, bits); } #if 0 static __inline void vm_page_assert_wait(vm_page_t m, int interruptible) { vm_page_flag_set(m, PG_WANTED); assert_wait((int) m, interruptible); } #endif static __inline void vm_page_busy(vm_page_t m) { vm_page_flag_set(m, PG_BUSY); } static __inline void vm_page_wakeup(vm_page_t m) { vm_page_flag_clear(m, PG_BUSY); if (m->flags & PG_WANTED) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } } static __inline void vm_page_io_start(vm_page_t m) { atomic_add_char(&(m)->busy, 1); } static __inline void vm_page_io_finish(vm_page_t m) { atomic_subtract_char(&m->busy, 1); if ((m->flags & PG_WANTED) && m->busy == 0) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } } #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xff #endif #if PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffff #endif #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 #define VM_ALLOC_ZERO 3 #define VM_ALLOC_RETRY 0x80 void vm_page_activate __P((vm_page_t)); vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); void vm_page_deactivate __P((vm_page_t)); void vm_page_free __P((vm_page_t)); void vm_page_free_zero __P((vm_page_t)); void vm_page_insert __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_pindex_t)); void vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); -void vm_page_unwire __P((vm_page_t)); +void vm_page_unwire __P((vm_page_t, int)); void vm_page_wire __P((vm_page_t)); void vm_page_unqueue __P((vm_page_t)); void vm_page_unqueue_nowakeup __P((vm_page_t)); void vm_page_set_validclean __P((vm_page_t, int, int)); void vm_page_set_invalid __P((vm_page_t, int, int)); static __inline boolean_t vm_page_zero_fill __P((vm_page_t)); int vm_page_is_valid __P((vm_page_t, int, int)); void vm_page_test_dirty __P((vm_page_t)); int vm_page_bits __P((int, int)); vm_page_t vm_page_list_find __P((int, int)); int vm_page_queue_index __P((vm_offset_t, int)); vm_page_t vm_page_select __P((vm_object_t, vm_pindex_t, int)); int vm_page_sleep(vm_page_t m, char *msg, char *busy); /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). 
 */
static __inline void
vm_page_hold(vm_page_t mem)
{
	mem->hold_count++;
}

static __inline void
vm_page_unhold(vm_page_t mem)
{
#ifdef DIAGNOSTIC
	if (--mem->hold_count < 0)
		panic("vm_page_unhold: hold count < 0!!!");
#else
	--mem->hold_count;
#endif
}

static __inline void
vm_page_protect(vm_page_t mem, int prot)
{
	if (prot == VM_PROT_NONE) {
		if (mem->flags & (PG_WRITEABLE|PG_MAPPED)) {
			pmap_page_protect(VM_PAGE_TO_PHYS(mem), VM_PROT_NONE);
			vm_page_flag_clear(mem, PG_WRITEABLE|PG_MAPPED);
		}
	} else if ((prot == VM_PROT_READ) && (mem->flags & PG_WRITEABLE)) {
		pmap_page_protect(VM_PAGE_TO_PHYS(mem), VM_PROT_READ);
		vm_page_flag_clear(mem, PG_WRITEABLE);
	}
}

/*
 *	vm_page_zero_fill:
 *
 *	Zero-fill the specified page.
 *	Written as a standard pagein routine, to
 *	be used by the zero-fill object.
 */
static __inline boolean_t
vm_page_zero_fill(m)
	vm_page_t m;
{
	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	return (TRUE);
}

/*
 *	vm_page_copy:
 *
 *	Copy one page to another
 */
static __inline void
vm_page_copy(src_m, dest_m)
	vm_page_t src_m;
	vm_page_t dest_m;
{
	pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m));
	dest_m->valid = VM_PAGE_BITS_ALL;
}

#endif				/* KERNEL */
#endif				/* !_VM_PAGE_ */
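
A minimal usage sketch of the two-argument vm_page_unwire() interface introduced in this
revision: the second argument decides which queue a page lands on when its last wiring is
dropped (nonzero keeps the old behaviour of requeueing on the active list; zero places it
on the inactive list so the pageout daemon sees it sooner). This is not part of the commit;
the helper example_unwire_range() and its parameters are hypothetical, the include set is
only indicative, and vm_map_pmap(), pmap_extract(), pmap_change_wiring() and
PHYS_TO_VM_PAGE() are used exactly as in the vm_fault_unwire() hunk above.

#include <sys/param.h>
#include <sys/lock.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

/*
 * Hypothetical helper: drop the wiring on every page mapped in [start, end)
 * and choose per call whether the pages should stay "hot".
 */
static void
example_unwire_range(vm_map_t map, vm_offset_t start, vm_offset_t end, int keep_active)
{
	vm_offset_t va, pa;
	pmap_t pmap = vm_map_pmap(map);

	for (va = start; va < end; va += PAGE_SIZE) {
		/* Wired pages must have a mapping; skip any hole defensively. */
		pa = pmap_extract(pmap, va);
		if (pa == (vm_offset_t) 0)
			continue;
		pmap_change_wiring(pmap, va, FALSE);
		/*
		 * Second argument is new in this revision:
		 *   1 -> tail of the active queue (the previous hard-wired behaviour,
		 *        and what vm_fault_unwire() now passes),
		 *   0 -> tail of the inactive queue, making the page an early
		 *        candidate for reclamation.
		 */
		vm_page_unwire(PHYS_TO_VM_PAGE(pa), keep_active ? 1 : 0);
	}
}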