Index: head/sys/alpha/alpha/pmap.c =================================================================== --- head/sys/alpha/alpha/pmap.c (revision 132413) +++ head/sys/alpha/alpha/pmap.c (revision 132414) @@ -1,2832 +1,2750 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 1998 Doug Rabson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp * with some ideas from NetBSD's alpha pmap */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ /* * Notes for alpha pmap. * * On alpha, pm_pdeobj will hold lev1, lev2 and lev3 page tables. * Indices from 0 to NUSERLEV3MAPS-1 will map user lev3 page tables, * indices from NUSERLEV3MAPS to NUSERLEV3MAPS+NUSERLEV2MAPS-1 will * map user lev2 page tables and index NUSERLEV3MAPS+NUSERLEV2MAPS * will map the lev1 page table. The lev1 table will self map at * address VADDR(PTLEV1I,0,0). * * The vm_object kptobj holds the kernel page tables on i386 (62 or 63 * of them, depending on whether the system is SMP). On alpha, kptobj * will hold the lev3 and lev2 page tables for K1SEG. Indices 0 to * NKLEV3MAPS-1 will map kernel lev3 page tables and indices * NKLEV3MAPS to NKLEV3MAPS+NKLEV2MAPS will map lev2 page tables. (XXX * should the kernel Lev1map be inserted into this object?). * * pvtmmap is not needed for alpha since K0SEG maps all of physical * memory. * * * alpha virtual memory map: * * * Address Lev1 index * * --------------------------------- * 0000000000000000 | | 0 * | | * | | * | | * | | * --- --- * User space (USEG) * --- --- * | | * | | * | | * | | * 000003ffffffffff | | 511=UMAXLEV1I * --------------------------------- * fffffc0000000000 | | 512=K0SEGLEV1I * | Kernel code/data/bss | * | | * | | * | | * --- --- * K0SEG * --- --- * | | * | 1-1 physical/virtual | * | | * | | * fffffdffffffffff | | * --------------------------------- * fffffe0000000000 | | 768=K1SEGLEV1I * | Kernel dynamic data | * | | * | | * | | * --- --- * K1SEG * --- --- * | | * | mapped by ptes | * | | * | | * fffffff7ffffffff | | * --------------------------------- * fffffffe00000000 | | 1023=PTLEV1I * | PTmap (pte self map) | * ffffffffffffffff | | * --------------------------------- * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if 0 #define PMAP_DIAGNOSTIC #define PMAP_DEBUG #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Some macros for manipulating virtual addresses */ #define ALPHA_L1SIZE (1L << ALPHA_L1SHIFT) #define ALPHA_L2SIZE (1L << ALPHA_L2SHIFT) #define alpha_l1trunc(va) ((va) & ~(ALPHA_L1SIZE-1)) #define alpha_l2trunc(va) ((va) & ~(ALPHA_L2SIZE-1)) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pte_w(pte) ((*(pte) & PG_W) != 0) #define pmap_pte_managed(pte) ((*(pte) & PG_MANAGED) != 0) #define pmap_pte_v(pte) ((*(pte) & PG_V) != 0) #define pmap_pte_pa(pte) alpha_ptob(ALPHA_PTE_TO_PFN(*(pte))) #define pmap_pte_prot(pte) (*(pte) & PG_PROT) #define pmap_pte_set_w(pte, v) ((v)?(*pte |= PG_W):(*pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*pte &= ~PG_PROT), (*pte |= (v))) /* * Given a map and a machine independent protection code, * convert to an alpha protection code. */ #define pte_prot(m, p) (protection_codes[m == kernel_pmap ? 0 : 1][p]) int protection_codes[2][8]; /* * Return non-zero if this pmap is currently active */ #define pmap_isactive(pmap) (pmap->pm_active) /* * Extract level 1, 2 and 3 page table indices from a va */ #define PTMASK ((1 << ALPHA_PTSHIFT) - 1) #define pmap_lev1_index(va) (((va) >> ALPHA_L1SHIFT) & PTMASK) #define pmap_lev2_index(va) (((va) >> ALPHA_L2SHIFT) & PTMASK) #define pmap_lev3_index(va) (((va) >> ALPHA_L3SHIFT) & PTMASK) /* * Given a physical address, construct a pte */ #define pmap_phys_to_pte(pa) ALPHA_PTE_FROM_PFN(alpha_btop(pa)) /* * Given a page frame number, construct a k0seg va */ #define pmap_k0seg_to_pfn(va) alpha_btop(ALPHA_K0SEG_TO_PHYS(va)) /* * Given a pte, construct a k0seg va */ #define pmap_k0seg_to_pte(va) ALPHA_PTE_FROM_PFN(pmap_k0seg_to_pfn(va)) /* * Lev1map: * * Kernel level 1 page table. This maps all kernel level 2 * page table pages, and is used as a template for all user * pmap level 1 page tables. When a new user level 1 page * table is allocated, all Lev1map PTEs for kernel addresses * are copied to the new map. * * Lev2map: * * Initial set of kernel level 2 page table pages. These * map the kernel level 3 page table pages. As kernel * level 3 page table pages are added, more level 2 page * table pages may be added to map them. These pages are * never freed. * * Lev3map: * * Initial set of kernel level 3 page table pages. These * map pages in K1SEG. More level 3 page table pages may * be added at run-time if additional K1SEG address space * is required. These pages are never freed. * * Lev2mapsize: * * Number of entries in the initial Lev2map. * * Lev3mapsize: * * Number of entries in the initial Lev3map. * * NOTE: When mappings are inserted into the kernel pmap, all * level 2 and level 3 page table pages must already be allocated * and mapped into the parent page table. */ pt_entry_t *Lev1map, *Lev2map, *Lev3map; vm_size_t Lev2mapsize, Lev3mapsize; /* * Statically allocated kernel pmap */ struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ static int nklev3, nklev2; vm_offset_t kernel_vm_end; /* * Data for the ASN allocator */ static int pmap_maxasn; static pmap_t pmap_active[MAXCPU]; static LIST_HEAD(,pmap) allpmaps; static struct mtx allpmaps_lock; /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void alpha_protection_init(void); static void pmap_changebit(vm_page_t m, int bit, boolean_t setem); static int pmap_remove_pte(pmap_t pmap, pt_entry_t* ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); -static int pmap_release_free_page(pmap_t pmap, vm_page_t p); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); #ifdef SMP static void pmap_invalidate_page_action(void *arg); static void pmap_invalidate_all_action(void *arg); #endif /* * Routine: pmap_lev1pte * Function: * Extract the level 1 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev1pte(pmap_t pmap, vm_offset_t va) { if (!pmap) return 0; return &pmap->pm_lev1[pmap_lev1_index(va)]; } /* * Routine: pmap_lev2pte * Function: * Extract the level 2 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev2pte(pmap_t pmap, vm_offset_t va) { pt_entry_t* l1pte; pt_entry_t* l2map; l1pte = pmap_lev1pte(pmap, va); if (!pmap_pte_v(l1pte)) return 0; l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l1pte)); return &l2map[pmap_lev2_index(va)]; } /* * Routine: pmap_lev3pte * Function: * Extract the level 3 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev3pte(pmap_t pmap, vm_offset_t va) { pt_entry_t* l2pte; pt_entry_t* l3map; l2pte = pmap_lev2pte(pmap, va); if (!l2pte || !pmap_pte_v(l2pte)) return 0; l3map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l2pte)); return &l3map[pmap_lev3_index(va)]; } vm_offset_t pmap_steal_memory(vm_size_t size) { vm_size_t bank_size; vm_offset_t pa, va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i+2]; i+= 2) { phys_avail[i] = phys_avail[i+2]; phys_avail[i+1] = phys_avail[i+3]; } phys_avail[i] = 0; phys_avail[i+1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; va = ALPHA_PHYS_TO_K0SEG(pa); bzero((caddr_t) va, size); return va; } extern pt_entry_t rom_pte; /* XXX */ extern int prom_mapped; /* XXX */ /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t ptaddr, u_int maxasn) { pt_entry_t newpte; int i; /* * Setup ASNs. PCPU_GET(next_asn) and PCPU_GET(current_asngen) are set * up already. */ pmap_maxasn = maxasn; /* * Allocate a level 1 map for the kernel. */ Lev1map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE); /* * Allocate a level 2 map for the kernel */ Lev2map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE); Lev2mapsize = PAGE_SIZE; /* * Allocate some level 3 maps for the kernel */ Lev3map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE*NKPT); Lev3mapsize = NKPT * PAGE_SIZE; /* Map all of the level 2 maps */ for (i = 0; i < howmany(Lev2mapsize, PAGE_SIZE); i++) { unsigned long pfn = pmap_k0seg_to_pfn((vm_offset_t) Lev2map) + i; newpte = ALPHA_PTE_FROM_PFN(pfn); newpte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_W; Lev1map[K1SEGLEV1I + i] = newpte; } /* Setup the mapping for the prom console */ { if (pmap_uses_prom_console()) { /* XXX save old pte so that we can remap prom if necessary */ rom_pte = *(pt_entry_t *)ptaddr & ~PG_ASM; /* XXX */ } prom_mapped = 0; /* * Actually, this code lies. The prom is still mapped, and will * remain so until the context switch after alpha_init() returns. * Printfs using the firmware before then will end up frobbing * Lev1map unnecessarily, but that's OK. */ } /* * Level 1 self mapping. * * Don't set PG_ASM since the self-mapping is different for each * address space. */ newpte = pmap_k0seg_to_pte((vm_offset_t) Lev1map); newpte |= PG_V | PG_KRE | PG_KWE; Lev1map[PTLEV1I] = newpte; /* Map all of the level 3 maps */ for (i = 0; i < howmany(Lev3mapsize, PAGE_SIZE); i++) { unsigned long pfn = pmap_k0seg_to_pfn((vm_offset_t) Lev3map) + i; newpte = ALPHA_PTE_FROM_PFN(pfn); newpte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_W; Lev2map[i] = newpte; } virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VPTBASE; /* * Initialize protection array. */ alpha_protection_init(); /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_lev1 = Lev1map; kernel_pmap->pm_active = ~0; kernel_pmap->pm_asn[alpha_pal_whami()].asn = 0; kernel_pmap->pm_asn[alpha_pal_whami()].gen = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nklev3 = NKPT; nklev2 = 1; /* * Initialize list of pmaps. */ LIST_INIT(&allpmaps); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); /* * Set up proc0's PCB such that the ptbr points to the right place * and has the kernel pmap's. */ thread0.td_pcb->pcb_hw.apcb_ptbr = ALPHA_K0SEG_TO_PHYS((vm_offset_t)Lev1map) >> PAGE_SHIFT; thread0.td_pcb->pcb_hw.apcb_asn = 0; } int pmap_uses_prom_console() { int cputype; cputype = hwrpb->rpb_type; return (cputype == ST_DEC_21000 || ST_DEC_4100); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(void) { int i; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_prealloc(pvzone, MINPV); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; pv_entry_high_water = 9 * (pv_entry_max / 10); } /*************************************************** * Manipulate TLBs for a pmap ***************************************************/ static void pmap_invalidate_asn(pmap_t pmap) { pmap->pm_asn[PCPU_GET(cpuid)].gen = 0; } struct pmap_invalidate_page_arg { pmap_t pmap; vm_offset_t va; }; static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { #ifdef SMP struct pmap_invalidate_page_arg arg; arg.pmap = pmap; arg.va = va; smp_rendezvous(0, pmap_invalidate_page_action, 0, (void *) &arg); } static void pmap_invalidate_page_action(void *arg) { pmap_t pmap = ((struct pmap_invalidate_page_arg *) arg)->pmap; vm_offset_t va = ((struct pmap_invalidate_page_arg *) arg)->va; #endif if (pmap->pm_active & PCPU_GET(cpumask)) { ALPHA_TBIS(va); alpha_pal_imb(); /* XXX overkill? */ } else { pmap_invalidate_asn(pmap); } } static void pmap_invalidate_all(pmap_t pmap) { #ifdef SMP smp_rendezvous(0, pmap_invalidate_all_action, 0, (void *) pmap); } static void pmap_invalidate_all_action(void *arg) { pmap_t pmap = (pmap_t) arg; #endif if (pmap->pm_active & PCPU_GET(cpumask)) { ALPHA_TBIA(); alpha_pal_imb(); /* XXX overkill? */ } else pmap_invalidate_asn(pmap); } static void pmap_get_asn(pmap_t pmap) { if (PCPU_GET(next_asn) > pmap_maxasn) { /* * Start a new ASN generation. * * Invalidate all per-process mappings and I-cache */ PCPU_SET(next_asn, 0); PCPU_SET(current_asngen, (PCPU_GET(current_asngen) + 1) & ASNGEN_MASK); if (PCPU_GET(current_asngen) == 0) { /* * Clear the pm_asn[].gen of all pmaps. * This is safe since it is only called from * pmap_activate after it has deactivated * the old pmap and it only affects this cpu. */ pmap_t tpmap; #ifdef PMAP_DIAGNOSTIC printf("pmap_get_asn: generation rollover\n"); #endif PCPU_SET(current_asngen, 1); mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(tpmap, &allpmaps, pm_list) { tpmap->pm_asn[PCPU_GET(cpuid)].gen = 0; } mtx_unlock_spin(&allpmaps_lock); } /* * Since we are about to start re-using ASNs, we must * clear out the TLB and the I-cache since they are tagged * with the ASN. */ ALPHA_TBIAP(); alpha_pal_imb(); /* XXX overkill? */ } pmap->pm_asn[PCPU_GET(cpuid)].asn = PCPU_GET(next_asn); PCPU_SET(next_asn, PCPU_GET(next_asn) + 1); pmap->pm_asn[PCPU_GET(cpuid)].gen = PCPU_GET(current_asngen); } /*************************************************** * Low level helper routines..... ***************************************************/ /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; vm_paddr_t pa; pa = 0; if (pmap == NULL) return (pa); PMAP_LOCK(pmap); pte = pmap_lev3pte(pmap, va); if (pte != NULL && pmap_pte_v(pte)) pa = pmap_pte_pa(pte); PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte; vm_page_t m; m = NULL; if (pmap == NULL) return (m); vm_page_lock_queues(); PMAP_LOCK(pmap); pte = pmap_lev3pte(pmap, va); if (pte != NULL && pmap_pte_v(pte) && (*pte & pte_prot(pmap, prot)) == pte_prot(pmap, prot)) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(pte)); vm_page_hold(m); } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; pt_entry_t *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; pt_entry_t npte = pmap_phys_to_pte(VM_PAGE_TO_PHYS(m[i])) | PG_ASM | PG_KRE | PG_KWE | PG_V; pt_entry_t opte; pte = vtopte(tva); opte = *pte; *pte = npte; if (opte) pmap_invalidate_page(kernel_pmap, tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { pte = vtopte(va); *pte = 0; pmap_invalidate_page(kernel_pmap, va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_offset_t pa) { pt_entry_t *pte; pt_entry_t npte, opte; npte = pmap_phys_to_pte(pa) | PG_ASM | PG_KRE | PG_KWE | PG_V; pte = vtopte(va); opte = *pte; *pte = npte; if (opte) pmap_invalidate_page(kernel_pmap, va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { register pt_entry_t *pte; pte = vtopte(va); *pte = 0; pmap_invalidate_page(kernel_pmap, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { return ALPHA_PHYS_TO_K0SEG(start); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; pt_entry_t* pte; /* * unmap the page table page */ if (m->pindex >= NUSERLEV3MAPS) { /* Level 2 page table */ pte = pmap_lev1pte(pmap, va); pteva = (vm_offset_t) PTlev2 + alpha_ptob(m->pindex - NUSERLEV3MAPS); } else { /* Level 3 page table */ pte = pmap_lev2pte(pmap, va); pteva = (vm_offset_t) PTmap + alpha_ptob(m->pindex); } *pte = 0; if (m->pindex < NUSERLEV3MAPS) { /* unhold the level 2 page table */ vm_page_t lev2pg; lev2pg = PHYS_TO_VM_PAGE(pmap_pte_pa(pmap_lev1pte(pmap, va))); vm_page_unhold(lev2pg); if (lev2pg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, lev2pg); } --pmap->pm_stats.resident_count; /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, va, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { unsigned ptepindex; if (va >= VM_MAXUSER_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> ALPHA_L2SHIFT); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = PHYS_TO_VM_PAGE(pmap_pte_pa(pmap_lev2pte(pmap, va))); pmap->pm_ptphint = mpte; } } return pmap_unwire_pte_hold(pmap, va, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { int i; PMAP_LOCK_INIT(pmap); pmap->pm_lev1 = Lev1map; pmap->pm_ptphint = NULL; pmap->pm_active = 0; for (i = 0; i < MAXCPU; i++) { pmap->pm_asn[i].asn = 0; pmap->pm_asn[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN | MTX_QUIET); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t lev1pg; int i; PMAP_LOCK_INIT(pmap); /* - * allocate object for the ptes - */ - if (pmap->pm_pteobj == NULL) - pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUSERLEV3MAPS + NUSERLEV2MAPS + 1); - - /* * allocate the page directory page */ - VM_OBJECT_LOCK(pmap->pm_pteobj); - lev1pg = vm_page_grab(pmap->pm_pteobj, NUSERLEV3MAPS + NUSERLEV2MAPS, - VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); + while ((lev1pg = vm_page_alloc(NULL, NUSERLEV3MAPS + NUSERLEV2MAPS, VM_ALLOC_NOOBJ | + VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) + VM_WAIT; - vm_page_lock_queues(); - vm_page_flag_clear(lev1pg, PG_BUSY); - lev1pg->valid = VM_PAGE_BITS_ALL; - vm_page_unlock_queues(); - VM_OBJECT_UNLOCK(pmap->pm_pteobj); - pmap->pm_lev1 = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(lev1pg)); + if ((lev1pg->flags & PG_ZERO) == 0) + bzero(pmap->pm_lev1, PAGE_SIZE); + /* install self-referential address mapping entry (not PG_ASM) */ pmap->pm_lev1[PTLEV1I] = pmap_phys_to_pte(VM_PAGE_TO_PHYS(lev1pg)) | PG_V | PG_KRE | PG_KWE; pmap->pm_ptphint = NULL; pmap->pm_active = 0; for (i = 0; i < MAXCPU; i++) { pmap->pm_asn[i].asn = 0; pmap->pm_asn[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); bcopy(PTlev1 + K1SEGLEV1I, pmap->pm_lev1 + K1SEGLEV1I, nklev2 * PTESIZE); } -static int -pmap_release_free_page(pmap_t pmap, vm_page_t p) -{ - pt_entry_t* pte; - pt_entry_t* l2map; - - if (p->pindex >= NUSERLEV3MAPS + NUSERLEV2MAPS) - /* level 1 page table */ - pte = &pmap->pm_lev1[PTLEV1I]; - else if (p->pindex >= NUSERLEV3MAPS) - /* level 2 page table */ - pte = &pmap->pm_lev1[p->pindex - NUSERLEV3MAPS]; - else { - /* level 3 page table */ - pte = &pmap->pm_lev1[p->pindex >> ALPHA_PTSHIFT]; - l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(pte)); - pte = &l2map[p->pindex & ((1 << ALPHA_PTSHIFT) - 1)]; - } - - /* - * This code optimizes the case of freeing non-busy - * page-table pages. Those pages are zero now, and - * might as well be placed directly into the zero queue. - */ - vm_page_lock_queues(); - if (vm_page_sleep_if_busy(p, FALSE, "pmaprl")) - return 0; - - vm_page_busy(p); - - /* - * Remove the page table page from the processes address space. - */ - *pte = 0; - pmap->pm_stats.resident_count--; - -#ifdef PMAP_DEBUG - if (p->hold_count) { - panic("pmap_release: freeing held page table page"); - } -#endif - /* - * Level1 pages need to have the kernel - * stuff cleared, so they can go into the zero queue also. - */ - if (p->pindex == NUSERLEV3MAPS + NUSERLEV2MAPS) - bzero(pmap->pm_lev1 + K1SEGLEV1I, nklev2 * PTESIZE); - - if (pmap->pm_ptphint == p) - pmap->pm_ptphint = NULL; - -#ifdef PMAP_DEBUG - { - u_long *lp = (u_long*) ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(p)); - u_long *ep = (u_long*) ((char*) lp + PAGE_SIZE); - for (; lp < ep; lp++) - if (*lp != 0) - panic("pmap_release_free_page: page not zero"); - } -#endif - - p->wire_count--; - atomic_subtract_int(&cnt.v_wire_count, 1); - vm_page_free_zero(p); - vm_page_unlock_queues(); - return 1; -} - /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { pt_entry_t* pte; vm_offset_t ptepa; vm_page_t m; - int is_object_locked; /* * Find or fabricate a new pagetable page */ - if (!(is_object_locked = VM_OBJECT_LOCKED(pmap->pm_pteobj))) - VM_OBJECT_LOCK(pmap->pm_pteobj); - m = vm_page_grab(pmap->pm_pteobj, ptepindex, - VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY); + if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + VM_WAIT; + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. + */ + return (NULL); + } + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); if (ptepindex >= NUSERLEV3MAPS) { pte = &pmap->pm_lev1[ptepindex - NUSERLEV3MAPS]; } else { int l1index = ptepindex >> ALPHA_PTSHIFT; pt_entry_t* l1pte = &pmap->pm_lev1[l1index]; pt_entry_t* l2map; - if (!pmap_pte_v(l1pte)) - _pmap_allocpte(pmap, NUSERLEV3MAPS + l1index); - else { + if (!pmap_pte_v(l1pte)) { + if (_pmap_allocpte(pmap, NUSERLEV3MAPS + l1index) == NULL) { + vm_page_lock_queues(); + vm_page_unhold(m); + vm_page_free(m); + vm_page_unlock_queues(); + return (NULL); + } + } else { vm_page_t l2page; l2page = PHYS_TO_VM_PAGE(pmap_pte_pa(l1pte)); l2page->hold_count++; } l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l1pte)); pte = &l2map[ptepindex & ((1 << ALPHA_PTSHIFT) - 1)]; } *pte = pmap_phys_to_pte(ptepa) | PG_KRE | PG_KWE | PG_V; /* * Set the page table hint */ pmap->pm_ptphint = m; - vm_page_lock_queues(); - m->valid = VM_PAGE_BITS_ALL; - vm_page_wakeup(m); - vm_page_unlock_queues(); - if (!is_object_locked) - VM_OBJECT_UNLOCK(pmap->pm_pteobj); - return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { unsigned ptepindex; pt_entry_t* lev2pte; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> (PAGE_SHIFT + ALPHA_PTSHIFT); - +retry: /* * Get the level2 entry */ lev2pte = pmap_lev2pte(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (lev2pte && pmap_pte_v(lev2pte)) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = PHYS_TO_VM_PAGE(pmap_pte_pa(lev2pte)); pmap->pm_ptphint = m; } m->hold_count++; - return m; + } else { + /* + * Here if the pte page isn't mapped, or if it has been + * deallocated. + */ + m = _pmap_allocpte(pmap, ptepindex); + if (m == NULL) + goto retry; } - /* - * Here if the pte page isn't mapped, or if it has been deallocated. - */ - return _pmap_allocpte(pmap, ptepindex); + return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { - vm_page_t p,n,lev1pg; - vm_object_t object = pmap->pm_pteobj; - int curgeneration; + vm_page_t lev1pg; -#if defined(DIAGNOSTIC) - if (object->ref_count != 1) - panic("pmap_release: pteobj reference count != 1"); -#endif - - lev1pg = NULL; -retry: - curgeneration = object->generation; - for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { - n = TAILQ_NEXT(p, listq); - if (p->pindex >= NUSERLEV3MAPS) { - continue; - } - while (1) { - if (!pmap_release_free_page(pmap, p) && - (object->generation != curgeneration)) - goto retry; - } - } - for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { - n = TAILQ_NEXT(p, listq); - if (p->pindex < NUSERLEV3MAPS) { - /* can this happen? maybe panic */ - goto retry; - } - if (p->pindex >= NUSERLEV3MAPS + NUSERLEV2MAPS) { - lev1pg = p; - continue; - } - while (1) { - if (!pmap_release_free_page(pmap, p) && - (object->generation != curgeneration)) - goto retry; - } - } + KASSERT(pmap->pm_stats.resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->pm_stats.resident_count)); - if (lev1pg && !pmap_release_free_page(pmap, lev1pg)) - goto retry; + lev1pg = PHYS_TO_VM_PAGE(pmap_pte_pa(&pmap->pm_lev1[PTLEV1I])); + KASSERT(lev1pg->pindex == NUSERLEV3MAPS + NUSERLEV2MAPS, + ("pmap_release: PTLEV1I page has unexpected pindex %ld", + lev1pg->pindex)); + mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); + + /* + * Level1 pages need to have the kernel + * stuff cleared, so they can go into the zero queue also. + */ + bzero(pmap->pm_lev1 + K1SEGLEV1I, nklev2 * PTESIZE); + pmap->pm_lev1[PTLEV1I] = 0; + PMAP_LOCK_DESTROY(pmap); + + vm_page_lock_queues(); + lev1pg->wire_count--; + atomic_subtract_int(&cnt.v_wire_count, 1); + vm_page_free_zero(lev1pg); + vm_page_unlock_queues(); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { /* XXX come back to this */ struct pmap *pmap; pt_entry_t* pte; pt_entry_t newlev1, newlev2; vm_offset_t pa; vm_page_t nkpg; critical_enter(); if (kernel_vm_end == 0) { kernel_vm_end = VM_MIN_KERNEL_ADDRESS;; /* Count the level 2 page tables */ nklev2 = 0; nklev3 = 0; while (pmap_pte_v(pmap_lev1pte(kernel_pmap, kernel_vm_end))) { nklev2++; nklev3 += (1L << ALPHA_PTSHIFT); kernel_vm_end += ALPHA_L1SIZE; } /* Count the level 3 page tables in the last level 2 page table */ kernel_vm_end -= ALPHA_L1SIZE; nklev3 -= (1 << ALPHA_PTSHIFT); while (pmap_pte_v(pmap_lev2pte(kernel_pmap, kernel_vm_end))) { nklev3++; kernel_vm_end += ALPHA_L2SIZE; } } addr = (addr + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); while (kernel_vm_end < addr) { /* * If the level 1 pte is invalid, allocate a new level 2 page table */ pte = pmap_lev1pte(kernel_pmap, kernel_vm_end); if (!pmap_pte_v(pte)) { int pindex = NKLEV3MAPS + pmap_lev1_index(kernel_vm_end) - K1SEGLEV1I; nkpg = vm_page_alloc(NULL, pindex, VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); printf("pmap_growkernel: growing to %lx\n", addr); printf("pmap_growkernel: adding new level2 page table\n"); nklev2++; pmap_zero_page(nkpg); pa = VM_PAGE_TO_PHYS(nkpg); newlev1 = pmap_phys_to_pte(pa) | PG_V | PG_ASM | PG_KRE | PG_KWE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { *pmap_lev1pte(pmap, kernel_vm_end) = newlev1; } mtx_unlock_spin(&allpmaps_lock); *pte = newlev1; pmap_invalidate_all(kernel_pmap); } /* * If the level 2 pte is invalid, allocate a new level 3 page table */ pte = pmap_lev2pte(kernel_pmap, kernel_vm_end); if (pmap_pte_v(pte)) { kernel_vm_end = (kernel_vm_end + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nklev3, VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nklev3++; pmap_zero_page(nkpg); pa = VM_PAGE_TO_PHYS(nkpg); newlev2 = pmap_phys_to_pte(pa) | PG_V | PG_ASM | PG_KRE | PG_KWE; *pte = newlev2; kernel_vm_end = (kernel_vm_end + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); } critical_exit(); } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) { pv_entry_t pv; pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; vm_page_lock_queues(); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; vm_page_unlock_queues(); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(&oldpte)); if ((oldpte & PG_FOW) == 0) { if (pmap_track_modified(va)) vm_page_dirty(m); } if ((oldpte & PG_FOR) == 0) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { register pt_entry_t *ptq; PMAP_LOCK_ASSERT(pmap, MA_OWNED); ptq = pmap_lev3pte(pmap, va); /* * if there is no pte for this address, just skip it!!! */ if (!ptq || !pmap_pte_v(ptq)) return; /* * get a local va for mappings for this pmap. */ (void) pmap_remove_pte(pmap, ptq, va); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va, nva; if (pmap == NULL) return; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; vm_page_lock_queues(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pmap_remove_page(pmap, sva); goto out; } for (va = sva; va < eva; va = nva) { if (!pmap_pte_v(pmap_lev1pte(pmap, va))) { nva = alpha_l1trunc(va + ALPHA_L1SIZE); continue; } if (!pmap_pte_v(pmap_lev2pte(pmap, va))) { nva = alpha_l2trunc(va + ALPHA_L2SIZE); continue; } pmap_remove_page(pmap, va); nva = va + PAGE_SIZE; } out: vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; if (pmap_pte_pa(pte) != VM_PAGE_TO_PHYS(m)) panic("pmap_remove_all: pv_table for %lx is inconsistent", VM_PAGE_TO_PHYS(m)); tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if ((tpte & PG_FOW) == 0) { if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } if ((tpte & PG_FOR) == 0) vm_page_flag_set(m, PG_REFERENCED); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); PMAP_UNLOCK(pv->pv_pmap); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pt_entry_t* pte; int newprot; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; newprot = pte_prot(pmap, prot); if ((sva & PAGE_MASK) || (eva & PAGE_MASK)) panic("pmap_protect: unaligned addresses"); vm_page_lock_queues(); PMAP_LOCK(pmap); while (sva < eva) { /* * If level 1 pte is invalid, skip this segment */ pte = pmap_lev1pte(pmap, sva); if (!pmap_pte_v(pte)) { sva = alpha_l1trunc(sva) + ALPHA_L1SIZE; continue; } /* * If level 2 pte is invalid, skip this segment */ pte = pmap_lev2pte(pmap, sva); if (!pmap_pte_v(pte)) { sva = alpha_l2trunc(sva) + ALPHA_L2SIZE; continue; } /* * If level 3 pte is invalid, skip this page */ pte = pmap_lev3pte(pmap, sva); if (!pmap_pte_v(pte)) { sva += PAGE_SIZE; continue; } if (pmap_pte_prot(pte) != newprot) { pt_entry_t oldpte = *pte; vm_page_t m = NULL; if ((oldpte & PG_FOR) == 0) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(pte)); vm_page_flag_set(m, PG_REFERENCED); oldpte |= (PG_FOR | PG_FOE); } if ((oldpte & PG_FOW) == 0) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(pte)); if (pmap_track_modified(sva)) vm_page_dirty(m); oldpte |= PG_FOW; } oldpte = (oldpte & ~PG_PROT) | newprot; *pte = oldpte; pmap_invalidate_page(pmap, sva); } sva += PAGE_SIZE; } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_offset_t pa; pt_entry_t *pte; vm_offset_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; int managed; if (pmap == NULL) return; va &= ~PAGE_MASK; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } pte = pmap_lev3pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid kernel page tables pmap=%p, va=0x%lx\n", pmap, va); } origpte = *pte; pa = VM_PAGE_TO_PHYS(m) & ~PAGE_MASK; managed = 0; opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_FOW) != PG_FOW && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } } managed = origpte & PG_MANAGED; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); err = pmap_remove_pte(pmap, pte, va); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%lx", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); managed |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = pmap_phys_to_pte(pa) | pte_prot(pmap, prot) | PG_V | managed; if (managed) { /* * Set up referenced/modified emulation for the new * mapping. Any old referenced/modified emulation * results for the old mapping will have been recorded * either in pmap_remove_pte() or above in the code * which handles protection and/or wiring changes. */ newpte |= (PG_FOR | PG_FOW | PG_FOE); } if (wired) newpte |= PG_W; /* * if the mapping or permission bits are different, we need * to update the pte. */ if (origpte != newpte) { *pte = newpte; if (origpte) pmap_invalidate_page(pmap, va); if (prot & VM_PROT_EXECUTE) alpha_pal_imb(); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { register pt_entry_t *pte; int managed; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pt_entry_t* l2pte; /* * Calculate lev2 page index */ ptepindex = va >> ALPHA_L2SHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { + retry: /* * Get the level 2 entry */ l2pte = pmap_lev2pte(pmap, va); /* * If the level 2 page table is mapped, we just increment * the hold count, and activate it. */ if (l2pte && pmap_pte_v(l2pte)) { if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = PHYS_TO_VM_PAGE(pmap_pte_pa(l2pte)); pmap->pm_ptphint = mpte; } mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); + if (mpte == NULL) + goto retry; } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, va, mpte); vm_page_unlock_queues(); } alpha_pal_imb(); /* XXX overkill? */ return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ managed = 0; if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); managed = PG_MANAGED | PG_FOR | PG_FOW | PG_FOE; } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pmap_phys_to_pte(VM_PAGE_TO_PHYS(m)) | PG_V | PG_KRE | PG_URE | managed; alpha_pal_imb(); /* XXX overkill? */ return mpte; } /* * Make temporary mapping for a physical address. This is called * during dump. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { return (void *) ALPHA_PHYS_TO_K0SEG(pa - (i * PAGE_SIZE)); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { pt_entry_t *pte; if (pmap == NULL) return; PMAP_LOCK(pmap); pte = pmap_lev3pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(m)); bzero((char *)(caddr_t)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. This is for the vm_pagezero idle process. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(mdst)); bcopy((caddr_t) src, (caddr_t) dst, PAGE_SIZE); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; /* * Not found, check current mappings returning immediately if found. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif vm_page_lock_queues(); PMAP_LOCK(pmap); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte_quick(pmap, pv->pv_va); #endif if (!pmap_pte_v(pte)) panic("pmap_remove_pages: page on pm_pvlist has no pte\n"); tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; m = PHYS_TO_VM_PAGE(pmap_pte_pa(&tpte)); pmap->pm_stats.resident_count--; if ((tpte & PG_FOW) == 0) if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); vm_page_unlock_queues(); } /* * this routine is used to modify bits in ptes */ static void pmap_changebit(vm_page_t m, int bit, boolean_t setem) { pv_entry_t pv; pt_entry_t *pte; int changed; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (!setem && bit == (PG_UWE|PG_KWE) && (m->flags & PG_WRITEABLE) == 0)) return; changed = 0; /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (!setem && bit == (PG_UWE|PG_KWE)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); changed = 0; if (setem) { *pte |= bit; changed = 1; } else { pt_entry_t pbits = *pte; if (pbits & bit) { changed = 1; *pte = pbits & ~bit; } } if (changed) pmap_invalidate_page(pv->pv_pmap, pv->pv_va); PMAP_UNLOCK(pv->pv_pmap); } if (!setem && bit == (PG_UWE|PG_KWE)) vm_page_flag_clear(m, PG_WRITEABLE); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(m, PG_KWE|PG_UWE, FALSE); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; int count; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return 0; /* * Loop over current mappings looking for any which have don't * have PG_FOR set (i.e. ones where we have taken an emulate * reference trap recently). */ count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (!(*pte & PG_FOR)) { count++; *pte |= PG_FOR | PG_FOE; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } return count; } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; boolean_t rv; rv = FALSE; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rv); /* * A page is modified if any mapping has had its PG_FOW flag * cleared. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); rv = !(*pte & PG_FOW); PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); if (pmap_pte_v(pmap_lev1pte(pmap, addr)) && pmap_pte_v(pmap_lev2pte(pmap, addr))) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; /* * Loop over current mappings setting PG_FOW where needed. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (!(*pte & PG_FOW)) { *pte |= PG_FOW; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; /* * Loop over current mappings setting PG_FOR and PG_FOE where needed. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (!(*pte & (PG_FOR | PG_FOE))) { *pte |= (PG_FOR | PG_FOE); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } } /* * pmap_emulate_reference: * * Emulate reference and/or modified bit hits. * From NetBSD */ void pmap_emulate_reference(struct vmspace *vm, vm_offset_t v, int user, int write) { pmap_t pmap; pt_entry_t faultoff, *pte; /* * Convert process and virtual address to physical address. */ if (v >= VM_MIN_KERNEL_ADDRESS) { if (user) panic("pmap_emulate_reference: user ref to kernel"); pmap = kernel_pmap; PMAP_LOCK(pmap); pte = vtopte(v); } else { KASSERT(vm != NULL, ("pmap_emulate_reference: bad vmspace")); pmap = &vm->vm_pmap; PMAP_LOCK(pmap); pte = pmap_lev3pte(pmap, v); } #ifdef DEBUG /* These checks are more expensive */ if (!pmap_pte_v(pte)) panic("pmap_emulate_reference: invalid pte"); #if 0 /* * Can't do these, because cpu_fork and cpu_swapin call * pmap_emulate_reference(), and the bits aren't guaranteed, * for them... */ if (write) { if (!(*pte & (user ? PG_UWE : PG_UWE | PG_KWE))) panic("pmap_emulate_reference: write but unwritable"); if (!(*pte & PG_FOW)) panic("pmap_emulate_reference: write but not FOW"); } else { if (!(*pte & (user ? PG_URE : PG_URE | PG_KRE))) panic("pmap_emulate_reference: !write but unreadable"); if (!(*pte & (PG_FOR | PG_FOE))) panic("pmap_emulate_reference: !write but not FOR|FOE"); } #endif /* Other diagnostics? */ #endif KASSERT((*pte & PG_MANAGED) != 0, ("pmap_emulate_reference(%p, 0x%lx, %d, %d): pa 0x%lx not managed", curthread, v, user, write, pmap_pte_pa(pte))); /* * Twiddle the appropriate bits to reflect the reference * and/or modification.. * * The rules: * (1) always mark page as used, and * (2) if it was a write fault, mark page as modified. */ if (write) { faultoff = PG_FOR | PG_FOE | PG_FOW; } else { faultoff = PG_FOR | PG_FOE; } *pte = (*pte & ~faultoff); ALPHA_TBIS(v); PMAP_UNLOCK(pmap); } /* * Miscellaneous support routines follow */ static void alpha_protection_init() { int prot, *kp, *up; kp = protection_codes[0]; up = protection_codes[1]; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: *kp++ = PG_ASM; *up++ = 0; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = PG_ASM | PG_KRE; *up++ = PG_URE | PG_KRE; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: *kp++ = PG_ASM | PG_KWE; *up++ = PG_UWE | PG_KWE; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_ASM | PG_KWE | PG_KRE; *up++ = PG_UWE | PG_URE | PG_KWE | PG_KRE; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { return (void*) ALPHA_PHYS_TO_K0SEG(pa); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *ptep, pte; int val = 0; PMAP_LOCK(pmap); ptep = pmap_lev3pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; PMAP_UNLOCK(pmap); if (pte & PG_V) { vm_page_t m; vm_offset_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = alpha_ptob(ALPHA_PTE_TO_PFN(pte)); m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if ((pte & PG_FOW) == 0) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if ((pte & (PG_FOR | PG_FOE)) == 0) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_t pmap; pmap = vmspace_pmap(td->td_proc->p_vmspace); critical_enter(); if (pmap_active[PCPU_GET(cpuid)] && pmap != pmap_active[PCPU_GET(cpuid)]) { atomic_clear_32(&pmap_active[PCPU_GET(cpuid)]->pm_active, PCPU_GET(cpumask)); pmap_active[PCPU_GET(cpuid)] = 0; } td->td_pcb->pcb_hw.apcb_ptbr = ALPHA_K0SEG_TO_PHYS((vm_offset_t) pmap->pm_lev1) >> PAGE_SHIFT; if (pmap->pm_asn[PCPU_GET(cpuid)].gen != PCPU_GET(current_asngen)) pmap_get_asn(pmap); pmap_active[PCPU_GET(cpuid)] = pmap; atomic_set_32(&pmap->pm_active, PCPU_GET(cpumask)); td->td_pcb->pcb_hw.apcb_asn = pmap->pm_asn[PCPU_GET(cpuid)].asn; critical_exit(); if (td == curthread) { alpha_pal_swpctx((u_long)td->td_md.md_pcbpaddr); } } void pmap_deactivate(struct thread *td) { pmap_t pmap; pmap = vmspace_pmap(td->td_proc->p_vmspace); atomic_clear_32(&pmap->pm_active, PCPU_GET(cpumask)); pmap_active[PCPU_GET(cpuid)] = 0; } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return addr; } #if 0 #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPG; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte_quick(pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { int i, j; vm_offset_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPG; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } void pmap_pvdump(pa) vm_offset_t pa; { pv_entry_t pv; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif #endif Index: head/sys/alpha/include/pmap.h =================================================================== --- head/sys/alpha/include/pmap.h (revision 132413) +++ head/sys/alpha/include/pmap.h (revision 132414) @@ -1,243 +1,242 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 * from: i386 pmap.h,v 1.54 1997/11/20 19:30:35 bde Exp * $FreeBSD$ */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ /* * Define meanings for a few software bits in the pte */ #define PG_V ALPHA_PTE_VALID #define PG_FOR ALPHA_PTE_FAULT_ON_READ #define PG_FOW ALPHA_PTE_FAULT_ON_WRITE #define PG_FOE ALPHA_PTE_FAULT_ON_EXECUTE #define PG_ASM ALPHA_PTE_ASM #define PG_GH ALPHA_PTE_GRANULARITY #define PG_KRE ALPHA_PTE_KR #define PG_URE ALPHA_PTE_UR #define PG_KWE ALPHA_PTE_KW #define PG_UWE ALPHA_PTE_UW #define PG_PROT ALPHA_PTE_PROT #define PG_SHIFT 32 #define PG_W 0x00010000 /* software wired */ #define PG_MANAGED 0x00020000 /* software managed */ /* * Pte related macros */ #define VADDR(l1, l2, l3) (((l1) << ALPHA_L1SHIFT) \ + ((l2) << ALPHA_L2SHIFT) \ + ((l3) << ALPHA_L3SHIFT) #ifndef NKPT #define NKPT 9 /* initial number of kernel page tables */ #endif #define NKLEV2MAPS 255 /* max number of lev2 page tables */ #define NKLEV3MAPS (NKLEV2MAPS << ALPHA_PTSHIFT) /* max number of lev3 page tables */ /* * The *PTDI values control the layout of virtual memory * * XXX This works for now, but I am not real happy with it, I'll fix it * right after I fix locore.s and the magic 28K hole * * SMP_PRIVPAGES: The per-cpu address space is 0xff80000 -> 0xffbfffff */ #define PTLEV1I (NPTEPG-1) /* Lev0 entry that points to Lev0 */ #define K0SEGLEV1I (NPTEPG/2) #define K1SEGLEV1I (K0SEGLEV1I+(NPTEPG/4)) #define NUSERLEV2MAPS (NPTEPG/2) #define NUSERLEV3MAPS (NUSERLEV2MAPS << ALPHA_PTSHIFT) #ifndef LOCORE #include #include #include typedef alpha_pt_entry_t pt_entry_t; #define PTESIZE sizeof(pt_entry_t) /* for assembly files */ /* * Address of current address space page table maps */ #ifdef _KERNEL extern pt_entry_t PTmap[]; /* lev3 page tables */ extern pt_entry_t PTlev2[]; /* lev2 page tables */ extern pt_entry_t PTlev1[]; /* lev1 page table */ extern pt_entry_t PTlev1pte; /* pte that maps lev1 page table */ #endif #ifdef _KERNEL /* * virtual address to page table entry and * to physical address. * Note: this work recursively, thus vtopte of a pte will give * the corresponding lev1 that in turn maps it. */ #define vtopte(va) (PTmap + (alpha_btop(va) \ & ((1 << 3*ALPHA_PTSHIFT)-1))) /* * Routine: pmap_kextract * Function: * Extract the physical page address associated * kernel virtual address. */ static __inline vm_offset_t pmap_kextract(vm_offset_t va) { vm_offset_t pa; if (va >= ALPHA_K0SEG_BASE && va <= ALPHA_K0SEG_END) pa = ALPHA_K0SEG_TO_PHYS(va); else pa = alpha_ptob(ALPHA_PTE_TO_PFN(*vtopte(va))) | (va & PAGE_MASK); return pa; } #define vtophys(va) pmap_kextract(((vm_offset_t) (va))) extern vm_offset_t alpha_XXX_dmamap_or; static __inline vm_offset_t alpha_XXX_dmamap(vm_offset_t va) { return (pmap_kextract(va) | alpha_XXX_dmamap_or); } #endif /* _KERNEL */ /* * Pmap stuff */ struct pv_entry; struct md_page { int pv_list_count; TAILQ_HEAD(,pv_entry) pv_list; }; #define ASN_BITS 8 #define ASNGEN_BITS (32 - ASN_BITS) #define ASNGEN_MASK ((1 << ASNGEN_BITS) - 1) struct pmap { struct mtx pm_mtx; pt_entry_t *pm_lev1; /* KVA of lev0map */ - vm_object_t pm_pteobj; /* Container for pte's */ TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ u_int32_t pm_active; /* active cpus */ struct { u_int32_t asn:ASN_BITS; /* address space number */ u_int32_t gen:ASNGEN_BITS; /* generation number */ } pm_asn[MAXSMPCPU]; struct pmap_statistics pm_stats; /* pmap statistics */ struct vm_page *pm_ptphint; /* pmap ptp hint */ LIST_ENTRY(pmap) pm_list; /* list of all pmaps. */ }; typedef struct pmap *pmap_t; #ifdef _KERNEL extern struct pmap kernel_pmap_store; #define kernel_pmap (&kernel_pmap_store) #define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) #define PMAP_LOCK_ASSERT(pmap, type) \ mtx_assert(&(pmap)->pm_mtx, (type)) #define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) #define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ NULL, MTX_DEF) #define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) #define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) #endif /* * For each vm_page_t, there is a list of all currently valid virtual * mappings of that page. An entry is a pv_entry_t, the list is pv_table. */ typedef struct pv_entry { pmap_t pv_pmap; /* pmap where mapping lies */ vm_offset_t pv_va; /* virtual address for mapping */ TAILQ_ENTRY(pv_entry) pv_list; TAILQ_ENTRY(pv_entry) pv_plist; vm_page_t pv_ptem; /* VM page for pte */ } *pv_entry_t; #ifdef _KERNEL extern vm_offset_t phys_avail[]; extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; struct vmspace; #define pmap_page_is_mapped(m) (!TAILQ_EMPTY(&(m)->md.pv_list)) vm_offset_t pmap_steal_memory(vm_size_t); void pmap_bootstrap(vm_offset_t, u_int); void pmap_kenter(vm_offset_t va, vm_offset_t pa); void *pmap_kenter_temporary(vm_offset_t pa, int i); void pmap_kremove(vm_offset_t); void pmap_setdevram(unsigned long long basea, vm_offset_t sizea); int pmap_uses_prom_console(void); void *pmap_mapdev(vm_offset_t, vm_size_t); void pmap_unmapdev(vm_offset_t, vm_size_t); unsigned *pmap_pte(pmap_t, vm_offset_t) __pure2; void pmap_set_opt (unsigned *); void pmap_set_opt_bsp (void); void pmap_deactivate(struct thread *td); void pmap_emulate_reference(struct vmspace *vm, vm_offset_t v, int user, int write); #endif /* _KERNEL */ #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 132413) +++ head/sys/vm/vm_page.c (revision 132414) @@ -1,1751 +1,1749 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - a pageq mutex is required when adding or removing a page from a * page queue (vm_page_queue[]), regardless of other mutexes or the * busy state of a page. * * - a hash chain mutex is required when associating or disassociating * a page from the VM PAGE CACHE hash table (vm_page_buckets), * regardless of other mutexes or the busy state of a page. * * - either a hash chain mutex OR a busied page is required in order * to modify the page flags. A hash chain mutex must be obtained in * order to busy a page. A page's flags cannot be modified by a * hash chain mutex if the page is marked busy. * * - The object memq mutex is held when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). This * is different from the object's main mutex. * * Generally speaking, you have to be aware of side effects when running * vm_page ops. A vm_page_lookup() will return with the hash chain * locked, whether it was able to lookup the page or not. vm_page_free(), * vm_page_cache(), vm_page_activate(), and a number of other routines * will release the hash chain mutex for you. Intermediate manipulation * routines such as vm_page_flag_set() expect the hash chain to be held * on entry and the hash chain will remain held on return. * * pageq scanning can only occur with the pageq in question locked. * We have a known bottleneck with the active queue, but the cache * and free queues are actually arrays already. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct mtx vm_page_queue_mtx; struct mtx vm_page_queue_free_mtx; vm_page_t vm_page_array = 0; int vm_page_array_size = 0; long first_page = 0; int vm_page_zero_count = 0; /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (cnt.v_page_size == 0) cnt.v_page_size = PAGE_SIZE; if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_size_t npages; vm_paddr_t page_range; vm_paddr_t new_end; int i; vm_paddr_t pa; int nblocks; vm_paddr_t last_pa; /* the biggest memory array is the second group of pages */ vm_paddr_t end; vm_paddr_t biggestsize; int biggestone; vm_paddr_t total; vm_size_t bootpages; total = 0; biggestsize = 0; biggestone = 0; nblocks = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } ++nblocks; total += size; } end = phys_avail[biggestone+1]; /* * Initialize the locks. */ mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF); mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL, MTX_SPIN); /* * Initialize the queue headers for the free queue, the active queue * and the inactive queue. */ vm_pageq_init(); /* * Allocate memory for use when boot strapping the kernel memory * allocator. */ bootpages = UMA_BOOT_PAGES * UMA_SLAB_SIZE; new_end = end - bootpages; new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((caddr_t) mapped, end - new_end); uma_startup((caddr_t)mapped); /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = phys_avail[0] / PAGE_SIZE; page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page; npages = (total - (page_range * sizeof(struct vm_page)) - (end - new_end)) / PAGE_SIZE; end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. */ vaddr += PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; phys_avail[biggestone + 1] = new_end; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); vm_page_array_size = page_range; /* * Construct the free queue(s) in descending order (by physical * address) so that the first 16MB of physical memory is allocated * last rather than first. On large-memory machines, this avoids * the exhaustion of low physical memory before isa_dmainit has run. */ cnt.v_page_count = 0; cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa && npages-- > 0) { vm_pageq_add_new_page(pa); pa += PAGE_SIZE; } } return (vaddr); } void vm_page_flag_set(vm_page_t m, unsigned short bits) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->flags |= bits; } void vm_page_flag_clear(vm_page_t m, unsigned short bits) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->flags &= ~bits; } void vm_page_busy(vm_page_t m) { KASSERT((m->flags & PG_BUSY) == 0, ("vm_page_busy: page already busy!!!")); vm_page_flag_set(m, PG_BUSY); } /* * vm_page_flash: * * wakeup anyone waiting for the page. */ void vm_page_flash(vm_page_t m) { if (m->flags & PG_WANTED) { vm_page_flag_clear(m, PG_WANTED); wakeup(m); } } /* * vm_page_wakeup: * * clear the PG_BUSY flag and wakeup anyone waiting for the * page. * */ void vm_page_wakeup(vm_page_t m) { KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!")); vm_page_flag_clear(m, PG_BUSY); vm_page_flash(m); } void vm_page_io_start(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->busy++; } void vm_page_io_finish(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->busy--; if (m->busy == 0) vm_page_flash(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); --mem->hold_count; KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!")); if (mem->hold_count == 0 && mem->queue == PQ_HOLD) vm_page_free_toq(mem); } /* * vm_page_free: * * Free a page * * The clearing of PG_ZERO is a temporary safety until the code can be * reviewed to determine that PG_ZERO is being properly cleared on * write faults or maps. PG_ZERO was previously cleared in * vm_page_alloc(). */ void vm_page_free(vm_page_t m) { vm_page_flag_clear(m, PG_ZERO); vm_page_free_toq(m); vm_page_zero_idle_wakeup(); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { vm_page_flag_set(m, PG_ZERO); vm_page_free_toq(m); } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if PG_BUSY is set or, * if also_m_busy is TRUE, busy is non-zero. Returns TRUE if the * thread slept and the page queues lock was released. * Otherwise, retains the page queues lock and returns FALSE. */ int vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg) { int is_object_locked; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) { vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); /* * Remove mtx_owned() after vm_object locking is finished. */ if ((is_object_locked = m->object != NULL && mtx_owned(&m->object->mtx))) mtx_unlock(&m->object->mtx); msleep(m, &vm_page_queue_mtx, PDROP | PVM, msg, 0); if (is_object_locked) mtx_lock(&m->object->mtx); return (TRUE); } return (FALSE); } /* * vm_page_dirty: * * make page all dirty */ void vm_page_dirty(vm_page_t m) { KASSERT(m->queue - m->pc != PQ_CACHE, ("vm_page_dirty: page in cache!")); KASSERT(m->queue - m->pc != PQ_FREE, ("vm_page_dirty: page is free!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_splay: * * Implements Sleator and Tarjan's top-down splay algorithm. Returns * the vm_page containing the given pindex. If, however, that * pindex is not found in the vm_object, returns a vm_page that is * adjacent to the pindex, coming before or after it. */ vm_page_t vm_page_splay(vm_pindex_t pindex, vm_page_t root) { struct vm_page dummy; vm_page_t lefttreemax, righttreemin, y; if (root == NULL) return (root); lefttreemax = righttreemin = &dummy; for (;; root = y) { if (pindex < root->pindex) { if ((y = root->left) == NULL) break; if (pindex < y->pindex) { /* Rotate right. */ root->left = y->right; y->right = root; root = y; if ((y = root->left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->left = root; righttreemin = root; } else if (pindex > root->pindex) { if ((y = root->right) == NULL) break; if (pindex > y->pindex) { /* Rotate left. */ root->right = y->left; y->left = root; root = y; if ((y = root->right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->right = root; lefttreemax = root; } else break; } /* Assemble the new root. */ lefttreemax->right = root->left; righttreemin->left = root->right; root->left = dummy.right; root->right = dummy.left; return (root); } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The pagetables are not updated but will presumably fault the page * in if necessary, or if a kernel page the caller will at some point * enter the page into the kernel's pmap. We are not allowed to block * here so we *can't* do this anyway. * * The object and page must be locked. * This routine may not block. */ void vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t root; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (m->object != NULL) panic("vm_page_insert: page already inserted"); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ root = object->root; if (root == NULL) { m->left = NULL; m->right = NULL; TAILQ_INSERT_TAIL(&object->memq, m, listq); } else { root = vm_page_splay(pindex, root); if (pindex < root->pindex) { m->left = root->left; m->right = root; root->left = NULL; TAILQ_INSERT_BEFORE(root, m, listq); } else if (pindex == root->pindex) panic("vm_page_insert: offset already allocated"); else { m->right = root->right; m->left = root; root->right = NULL; TAILQ_INSERT_AFTER(&object->memq, root, m, listq); } } object->root = m; object->generation++; /* * show that the object has one more resident page. */ object->resident_page_count++; /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags. */ if (m->flags & PG_WRITEABLE) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page * table and the object page list, but do not invalidate/terminate * the backing store. * * The object and page must be locked. * The underlying pmap entry (if any) is NOT removed here. * This routine may not block. */ void vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t root; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->object == NULL) return; -#ifndef __alpha__ VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); -#endif if ((m->flags & PG_BUSY) == 0) { panic("vm_page_remove: page not busy"); } /* * Basically destroy the page. */ vm_page_wakeup(m); object = m->object; /* * Now remove from the object's list of backed pages. */ if (m != object->root) vm_page_splay(m->pindex, object->root); if (m->left == NULL) root = m->right; else { root = vm_page_splay(m->pindex, m->left); root->right = m->right; } object->root = root; TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; object->generation++; m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. * This routine may not block. * This is a critical path routine */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if ((m = object->root) != NULL && m->pindex != pindex) { m = vm_page_splay(pindex, m); if ((object->root = m)->pindex != pindex) m = NULL; } return (m); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * The object must be locked. * This routine may not block. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. If the page is on the cache, we have to deactivate it * or vm_page_dirty() will panic. Dirty pages are not allowed * on the cache. */ void vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); if (m->queue - m->pc == PQ_CACHE) vm_page_deactivate(m); vm_page_dirty(m); } /* * vm_page_select_cache: * * Find a page on the cache queue with color optimization. As pages * might be found, but not applicable, they are deactivated. This * keeps us from using potentially busy cached pages. * * This routine may not block. */ vm_page_t vm_page_select_cache(int color) { vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((m = vm_pageq_find(PQ_CACHE, color, FALSE)) != NULL) { if ((m->flags & PG_BUSY) == 0 && m->busy == 0 && m->hold_count == 0 && (VM_OBJECT_TRYLOCK(m->object) || VM_OBJECT_LOCKED(m->object))) { KASSERT(m->dirty == 0, ("Found dirty cache page %p", m)); KASSERT(!pmap_page_is_mapped(m), ("Found mapped cache page %p", m)); KASSERT((m->flags & PG_UNMANAGED) == 0, ("Found unmanaged cache page %p", m)); KASSERT(m->wire_count == 0, ("Found wired cache page %p", m)); break; } vm_page_deactivate(m); } return (m); } /* * vm_page_alloc: * * Allocate and return a memory cell associated * with this VM object/offset pair. * * page_req classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * VM_ALLOC_ZERO zero page * * This routine may not block. * * Additional special handling is required when called from an * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with * the page cache in this case. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { vm_object_t m_object; vm_page_t m = NULL; int color, flags, page_req; page_req = req & VM_ALLOC_CLASS_MASK; if ((req & VM_ALLOC_NOOBJ) == 0) { KASSERT(object != NULL, ("vm_page_alloc: NULL object.")); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); color = (pindex + object->pg_color) & PQ_L2_MASK; } else color = pindex & PQ_L2_MASK; /* * The pager is allowed to eat deeper into the free page list. */ if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; loop: mtx_lock_spin(&vm_page_queue_free_mtx); if (cnt.v_free_count > cnt.v_free_reserved || (page_req == VM_ALLOC_SYSTEM && cnt.v_cache_count == 0 && cnt.v_free_count > cnt.v_interrupt_free_min) || (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) { /* * Allocate from the free queue if the number of free pages * exceeds the minimum for the request class. */ m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0); } else if (page_req != VM_ALLOC_INTERRUPT) { mtx_unlock_spin(&vm_page_queue_free_mtx); /* * Allocatable from cache (non-interrupt only). On success, * we must free the page and try again, thus ensuring that * cnt.v_*_free_min counters are replenished. */ vm_page_lock_queues(); if ((m = vm_page_select_cache(color)) == NULL) { #if defined(DIAGNOSTIC) if (cnt.v_cache_count > 0) printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count); #endif vm_page_unlock_queues(); atomic_add_int(&vm_pageout_deficit, 1); pagedaemon_wakeup(); return (NULL); } m_object = m->object; VM_OBJECT_LOCK_ASSERT(m_object, MA_OWNED); vm_page_busy(m); vm_page_free(m); vm_page_unlock_queues(); if (m_object != object) VM_OBJECT_UNLOCK(m_object); goto loop; } else { /* * Not allocatable from cache from interrupt, give up. */ mtx_unlock_spin(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, 1); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. */ KASSERT( m != NULL, ("vm_page_alloc(): missing page on free queue") ); /* * Remove from free queue */ vm_pageq_remove_nowakeup(m); /* * Initialize structure. Only the PG_ZERO flag is inherited. */ flags = PG_BUSY; if (m->flags & PG_ZERO) { vm_page_zero_count--; if (req & VM_ALLOC_ZERO) flags = PG_ZERO | PG_BUSY; } if (req & VM_ALLOC_NOOBJ) flags &= ~PG_BUSY; m->flags = flags; if (req & VM_ALLOC_WIRED) { atomic_add_int(&cnt.v_wire_count, 1); m->wire_count = 1; } else m->wire_count = 0; m->hold_count = 0; m->act_count = 0; m->busy = 0; m->valid = 0; KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m)); mtx_unlock_spin(&vm_page_queue_free_mtx); if ((req & VM_ALLOC_NOOBJ) == 0) vm_page_insert(m, object, pindex); else m->pindex = pindex; /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } /* * vm_wait: (also see VM_WAIT macro) * * Block until free pages are available for allocation * - Called in various places before memory allocations. */ void vm_wait(void) { vm_page_lock_queues(); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_mtx, PDROP | PSWP, "VMWait", 0); } else { if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Block until free pages are available for allocation * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { vm_page_lock_queues(); if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&cnt.v_free_count, &vm_page_queue_mtx, PDROP | PUSER, "pfault", 0); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page queues must be locked. * This routine may not block. */ void vm_page_activate(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->queue != PQ_ACTIVE) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; vm_pageq_remove(m); if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; vm_pageq_enqueue(PQ_ACTIVE, m); } } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq() and vm_page_cache(). This * routine is called when a page has been added to the cache or free * queues. * * The page queues must be locked. * This routine may not block. */ static __inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = 0; wakeup(&cnt.v_free_count); } } /* * vm_page_free_toq: * * Returns the given page to the PQ_FREE list, * disassociating it with any VM object. * * Object and page must be locked prior to entry. * This routine may not block. */ void vm_page_free_toq(vm_page_t m) { struct vpgqueues *pq; vm_object_t object = m->object; mtx_assert(&vm_page_queue_mtx, MA_OWNED); cnt.v_tfree++; if (m->busy || ((m->queue - m->pc) == PQ_FREE)) { printf( "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n", (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0, m->hold_count); if ((m->queue - m->pc) == PQ_FREE) panic("vm_page_free: freeing free page"); else panic("vm_page_free: freeing busy page"); } /* * unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_pageq_remove_nowakeup(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) { if (m->wire_count > 1) { panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx", m->wire_count, (long)m->pindex); } panic("vm_page_free: freeing wired page"); } /* * If we've exhausted the object's resident pages we want to free * it up. */ if (object && (object->type == OBJT_VNODE) && ((object->flags & OBJ_DEAD) == 0) ) { struct vnode *vp = (struct vnode *)object->handle; if (vp) { VI_LOCK(vp); if (VSHOULDFREE(vp)) vfree(vp); VI_UNLOCK(vp); } } /* * Clear the UNMANAGED flag when freeing an unmanaged page. */ if (m->flags & PG_UNMANAGED) { m->flags &= ~PG_UNMANAGED; } if (m->hold_count != 0) { m->flags &= ~PG_ZERO; m->queue = PQ_HOLD; } else m->queue = PQ_FREE + m->pc; pq = &vm_page_queues[m->queue]; mtx_lock_spin(&vm_page_queue_free_mtx); pq->lcnt++; ++(*pq->cnt); /* * Put zero'd pages on the end ( where we look for zero'd pages * first ) and non-zerod pages at the head. */ if (m->flags & PG_ZERO) { TAILQ_INSERT_TAIL(&pq->pl, m, pageq); ++vm_page_zero_count; } else { TAILQ_INSERT_HEAD(&pq->pl, m, pageq); } mtx_unlock_spin(&vm_page_queue_free_mtx); vm_page_free_wakeup(); } /* * vm_page_unmanage: * * Prevent PV management from being done on the page. The page is * removed from the paging queues as if it were wired, and as a * consequence of no longer being managed the pageout daemon will not * touch it (since there is no way to locate the pte mappings for the * page). madvise() calls that mess with the pmap will also no longer * operate on the page. * * Beyond that the page is still reasonably 'normal'. Freeing the page * will clear the flag. * * This routine is used by OBJT_PHYS objects - objects using unswappable * physical memory as backing store rather then swap-backed memory and * will eventually be extended to support 4MB unmanaged physical * mappings. */ void vm_page_unmanage(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_UNMANAGED) == 0) { if (m->wire_count == 0) vm_pageq_remove(m); } vm_page_flag_set(m, PG_UNMANAGED); } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * The page queues must be locked. * This routine may not block. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->flags & PG_FICTITIOUS) return; if (m->wire_count == 0) { if ((m->flags & PG_UNMANAGED) == 0) vm_pageq_remove(m); atomic_add_int(&cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of this page, potentially * enabling it to be paged again. * * Many pages placed on the inactive queue should actually go * into the cache, but it is difficult to figure out which. What * we do instead, if the inactive target is well met, is to put * clean pages at the head of the inactive queue instead of the tail. * This will cause them to be moved to the cache more quickly and * if not actively re-referenced, freed more quickly. If we just * stick these pages at the end of the inactive queue, heavy filesystem * meta-data accesses can cause an unnecessary paging load on memory bound * processes. This optimization causes one-time-use metadata to be * reused more quickly. * * BUT, if we are in a low-memory situation we have no choice but to * put clean pages on the cache queue. * * A number of routines use vm_page_unwire() to guarantee that the page * will go into either the inactive or active queues, and will NEVER * be placed in the cache - for example, just after dirtying a page. * dirty pages in the cache are not allowed. * * The page queues must be locked. * This routine may not block. */ void vm_page_unwire(vm_page_t m, int activate) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->flags & PG_FICTITIOUS) return; if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { atomic_subtract_int(&cnt.v_wire_count, 1); if (m->flags & PG_UNMANAGED) { ; } else if (activate) vm_pageq_enqueue(PQ_ACTIVE, m); else { vm_page_flag_clear(m, PG_WINATCFLS); vm_pageq_enqueue(PQ_INACTIVE, m); } } } else { panic("vm_page_unwire: invalid wire count: %d", m->wire_count); } } /* * Move the specified page to the inactive queue. If the page has * any associated swap, the swap is deallocated. * * Normally athead is 0 resulting in LRU operation. athead is set * to 1 if we want this page to be 'as if it were placed in the cache', * except without unmapping it from the process address space. * * This routine may not block. */ static __inline void _vm_page_deactivate(vm_page_t m, int athead) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Ignore if already inactive. */ if (m->queue == PQ_INACTIVE) return; if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if ((m->queue - m->pc) == PQ_CACHE) cnt.v_reactivated++; vm_page_flag_clear(m, PG_WINATCFLS); vm_pageq_remove(m); if (athead) TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); else TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); m->queue = PQ_INACTIVE; vm_page_queues[PQ_INACTIVE].lcnt++; cnt.v_inactive_count++; } } void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, 0); } /* * vm_page_try_to_cache: * * Returns 0 on failure, 1 on success */ int vm_page_try_to_cache(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->dirty || m->hold_count || m->busy || m->wire_count || (m->flags & (PG_BUSY|PG_UNMANAGED))) { return (0); } pmap_remove_all(m); if (m->dirty) return (0); vm_page_cache(m); return (1); } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->object != NULL) VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->dirty || m->hold_count || m->busy || m->wire_count || (m->flags & (PG_BUSY|PG_UNMANAGED))) { return (0); } pmap_remove_all(m); if (m->dirty) return (0); vm_page_busy(m); vm_page_free(m); return (1); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). * * This routine may not block. */ void vm_page_cache(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->hold_count || m->wire_count) { printf("vm_page_cache: attempting to cache busy page\n"); return; } if ((m->queue - m->pc) == PQ_CACHE) return; /* * Remove all pmaps and indicate that the page is not * writeable or mapped. */ pmap_remove_all(m); if (m->dirty != 0) { panic("vm_page_cache: caching a dirty page, pindex: %ld", (long)m->pindex); } vm_pageq_remove_nowakeup(m); vm_pageq_enqueue(PQ_CACHE + m->pc, m); vm_page_free_wakeup(); } /* * vm_page_dontneed * * Cache, deactivate, or do nothing as appropriate. This routine * is typically used by madvise() MADV_DONTNEED. * * Generally speaking we want to move the page into the cache so * it gets reused quickly. However, this can result in a silly syndrome * due to the page recycling too quickly. Small objects will not be * fully cached. On the otherhand, if we move the page to the inactive * queue we wind up with a problem whereby very large objects * unnecessarily blow away our inactive and cache queues. * * The solution is to move the pages based on a fixed weighting. We * either leave them alone, deactivate them, or move them to the cache, * where moving them to the cache has the highest weighting. * By forcing some pages into other queues we eventually force the * system to balance the queues, potentially recovering other unrelated * space from active. The idea is to not force this to happen too * often. */ void vm_page_dontneed(vm_page_t m) { static int dnweight; int dnw; int head; mtx_assert(&vm_page_queue_mtx, MA_OWNED); dnw = ++dnweight; /* * occassionally leave the page alone */ if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE || m->queue - m->pc == PQ_CACHE ) { if (m->act_count >= ACT_INIT) --m->act_count; return; } if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty || (dnw & 0x0070) == 0) { /* * Deactivate the page 3 times out of 32. */ head = 0; } else { /* * Cache the page 28 times out of every 32. Note that * the page is deactivated instead of cached, but placed * at the head of the queue instead of the tail. */ head = 1; } _vm_page_deactivate(m, head); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may block. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { vm_page_lock_queues(); if (m->busy || (m->flags & PG_BUSY)) { vm_page_flag_set(m, PG_WANTED | PG_REFERENCED); VM_OBJECT_UNLOCK(object); msleep(m, &vm_page_queue_mtx, PDROP | PVM, "pgrbwt", 0); VM_OBJECT_LOCK(object); if ((allocflags & VM_ALLOC_RETRY) == 0) return (NULL); goto retrylookup; } else { if (allocflags & VM_ALLOC_WIRED) vm_page_wire(m); vm_page_busy(m); vm_page_unlock_queues(); return (m); } } m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY); if (m == NULL) { VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); if ((allocflags & VM_ALLOC_RETRY) == 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid bits or for dirty bits in * a page. May not block. * * Inputs are required to range within a page. */ __inline int vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return ((2 << last_bit) - (1 << first_bit)); } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * This routine may not block. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { int pagebits; int frag; int endoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = base & ~(DEV_BSIZE - 1)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the PG_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif m->dirty &= ~pagebits; if (base == 0 && size == PAGE_SIZE) { pmap_clear_modify(m); vm_page_flag_clear(m, PG_NOSYNC); } } void vm_page_clear_dirty(vm_page_t m, int base, int size) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->dirty &= ~vm_page_bits(base, size); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. * * May not block. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { int bits; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); bits = vm_page_bits(base, size); mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->valid &= ~bits; m->dirty &= ~bits; m->object->generation++; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); /* * Scan the valid bits looking for invalid sections that * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zerod by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & (1 << i)) ) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. * * May not block. */ int vm_page_is_valid(vm_page_t m, int base, int size) { int bits = vm_page_bits(base, size); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->valid && ((m->valid & bits) == bits)) return 1; else return 0; } /* * update dirty bits from pmap/mmu. May not block. */ void vm_page_test_dirty(vm_page_t m) { if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) { vm_page_dirty(m); } } int so_zerocp_fullpage = 0; void vm_page_cowfault(vm_page_t m) { vm_page_t mnew; vm_object_t object; vm_pindex_t pindex; object = m->object; pindex = m->pindex; vm_page_busy(m); retry_alloc: vm_page_remove(m); /* * An interrupt allocation is requested because the page * queues lock is held. */ mnew = vm_page_alloc(object, pindex, VM_ALLOC_INTERRUPT); if (mnew == NULL) { vm_page_insert(m, object, pindex); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); vm_page_lock_queues(); goto retry_alloc; } if (m->cow == 0) { /* * check to see if we raced with an xmit complete when * waiting to allocate a page. If so, put things back * the way they were */ vm_page_busy(mnew); vm_page_free(mnew); vm_page_insert(m, object, pindex); } else { /* clear COW & copy page */ if (!so_zerocp_fullpage) pmap_copy_page(m, mnew); mnew->valid = VM_PAGE_BITS_ALL; vm_page_dirty(mnew); vm_page_flag_clear(mnew, PG_BUSY); } } void vm_page_cowclear(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->cow) { m->cow--; /* * let vm_fault add back write permission lazily */ } /* * sf_buf_free() will free the page, so we needn't do it here */ } void vm_page_cowsetup(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->cow++; pmap_page_protect(m, VM_PROT_READ); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int i; db_printf("PQ_FREE:"); for (i = 0; i < PQ_L2_SIZE; i++) { db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt); } db_printf("\n"); db_printf("PQ_CACHE:"); for (i = 0; i < PQ_L2_SIZE; i++) { db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt); } db_printf("\n"); db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", vm_page_queues[PQ_ACTIVE].lcnt, vm_page_queues[PQ_INACTIVE].lcnt); } #endif /* DDB */